Urd 128 =rn[rc] (1284:64/sjze) * rb f28 

m[rc ]( 128*64/size) 
511 




rb(128) 



128 rd(128) 0 



FIG. 2 



511 m[rc](128*64/size) n 




NMMM 

Nh+Jg+Ff+Be+xd+tc+pb+l a^ 




+ ) + ) ( + + 



Kh+Gg+Cf+ye+ud+qc+mb+io 



Mh+lg+Ef+Ae+wd+sc+ob+ko rd(128) 0 




FIG. 3 



Lh+Hg+Df+ze+ vd+rc+nb+ jo 



□ specifier=address+( size/2)+( width/2) 



_ 

depth = 4 bytes \ 



width = 16 bytesV ^A I 5 ** = depth * ^ = 64 ^ e Z_ 



address is aligned to size (64 bytes), 
so low- order 6 bits are zero 



address 
size/2 



aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 



000000 \ 



000000000000000000000000000000000 1 100000 | 



width/2 1 000000000000000000000000000000000 \ 001000 \ 



specifier 
500 



aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 



505 



101000 | 



FIG. 5 



510 



specifier 
600 



aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 



605 



615- 



101000 \ ^ 



610 



s and (0-s) I 

T 



width/2 000000000000000000000000000000000 001000 \ 
_ — T _> 

620 — 



t 



«5-sJ s and not (width /2)\ 



aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 



630 



5J5- 



j and ffl-f; | 



T 



size/2 \00OO0000O000O000000O0O00000O00000 100000 
______ _j 



640 



645- 



address 



t and not (size/2)\ 



aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa 



650 



000000 l 



FIG. 6 



700 

_^ | 

Register number j 

705 

A s 

Operand 
checker 




730A-H- 



725^ 



Wide operand specifier y ^710 



Memory 
Memory width- 



Register operand 



Register operand 



Portion 0 



Portion 1 



Portion 2 



Portion 3 



Portion 4 



Portion 5 



Portion 6 



zzzzzzz 



T Portion 7 



Function 



f 



Function unit with dedicated storage 



Result 
Register width\ - 



-715 



720A 



720n 



J 



714 



735 



^,-740 



Wide 
operand 



745 



FIG. 7 



□ wmc.c - contents 

□ wmc.pa - physical address 
□ wmc.size - size of contents 
□ wmc.cv - contents valid 
□ wmc.th - thread last used 

□ wmc.reg - register last used 
□ wmc.rtv - register & thread valid 



FIG. 9 
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Operation codes 




W.SWITCH.B 


Wide switch big-endian 


W.SWITCH.L 


Wide switch little-endian 



Selection 



class 


op 


order 


Wide switch 


W.SWITCH 


B L 



Format 

W.op.order ra=rc,rd,rb 
ra=woporder(rc,rd,rb) 



31 24 23 18 17 12 11 6 5 0 

I W.op.order I rd | rc I rb I ra I 
8 6 6 6 6 



FIG. 12A 



^~ 1230 




FIG. 12B 



1250 



Definition 

def WideSwitch(op,rd,rc,rb,ra) 
    d ← RegRead(rd, 128) 
    c ← RegRead(rc, 64) 
    b ← RegRead(rb, 128) 
    if c_{1..0} ≠ 0 then 
        raise AccessDisallowedByVirtualAddress 
    elseif c_{6..0} ≠ 0 then 
        VirtAddr ← c and (c-1) 
        w ← wsize ← (c and (0-c)) || 0^{1} 
    else 
        VirtAddr ← c 
        w ← wsize ← 128 
    endif 
    msize ← 8*wsize 
    lwsize ← log(wsize) 
    case op of 
        W.SWITCH.B: 
            order ← B 
        W.SWITCH.L: 
            order ← L 
    endcase 
    m ← LoadMemory(c,VirtAddr,msize,order) 
    db ← d || b 
    for i ← 0 to 127 
        j ← 0 || i_{lwsize-1..0} 

        k ← m_{7*w+j} || m_{6*w+j} || m_{5*w+j} || m_{4*w+j} || m_{3*w+j} || m_{2*w+j} || m_{w+j} || m_{j} 
        l ← i_{7..lwsize} || j_{lwsize-1..0} 
        a_{i} ← db_{l} 

endfor 

RegWrite(ra, 128, a) 
enddef 



FIG. 12C 



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 



W.TRANSLATE.8.B 


Wide translate bytes big-endian 


W.TRANSLATE.16.B 


Wide translate doublets big-endian 


W.TRANSLATE.32.B 


Wide translate quadlets big-endian 


W.TRANSLATE.64.B 


Wide translate octlets big-endian 


W.TRANSLATE.8.L 


Wide translate bytes little-endian 


W.TRANSLATE.16.L 


Wide translate doublets little-endian 


W.TRANSLATE.32.L 


Wide translate quadlets little-endian 


W.TRANSLATE.64.L 


Wide translate octlets little-endian 



Selection 



class 


size 


order 


Wide translate 


8 16 32 64 


B L 



Format 

W.TRANSLATE.size.order rd=rc,rb 
rd=wtranslatesizeorder(rc,rb) 

31 24 23 18 17 12 11 6 5 2 1 0 

| W.TRANSLATE.order | rd | rc | rb | 0 | sz | 



sz ← log(size) - 3 



FIG. 13A 



r 



1330 



vsize 



g size 



w size 



Wide translate: 16 entries by 64 bits 



FIG. 13B 



^1350 

Definition 

def WideTranslate(op,gsize,rd,rc,rb) 
    c ← RegRead(rc, 64) 
    b ← RegRead(rb, 128) 
    lgsize ← log(gsize) 
    if c_{lgsize-4..0} ≠ 0 then 
        raise AccessDisallowedByVirtualAddress 
    endif 
    if c_{4..lgsize-3} ≠ 0 then 
        wsize ← (c and (0-c)) || 0^{3} 
        t ← c and (c-1) 
    else 
        wsize ← 128 
        t ← c 
    endif 
    lwsize ← log(wsize) 
    if t_{lwsize+4..lwsize-2} ≠ 0 then 
        msize ← (t and (0-t)) || 0^{4} 
        VirtAddr ← t and (t-1) 
    else 
        msize ← 256*wsize 
        VirtAddr ← t 
    endif 
    case op of 
        W.TRANSLATE.B: 
            order ← B 
        W.TRANSLATE.L: 
            order ← L 
    endcase 
    m ← LoadMemory(c,VirtAddr,msize,order) 
    vsize ← msize/wsize 
    lvsize ← log(vsize) 

    for i ← 0 to 128-gsize by gsize 
        j ← ((order=B)^{lvsize} ^ (b_{lvsize-1+i..i})) * wsize + i_{lwsize-1..0} 
        a_{gsize-1+i..i} ← m_{j+gsize-1..j} 
endfor 

RegWrite(rd, 128, a) 
enddef 
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Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 


1410 


W.MUL.MAT.8.B 


Wide multiply matrix signed byte big-endian 


W.MUL.MAT.8.L 


Wide multiply matrix signed byte little-endian 


W.MUL.MAT.16.B 


Wide multiply matrix signed doublet big-endian 


W.MUL.MAT.16.L 


Wide multiply matrix signed doublet little-endian 


W.MUL.MAT.32.B 


Wide multiply matrix signed quadlet big-endian 


W.MUL.MAT.32.L 


Wide multiply matrix signed quadlet little-endian 


W.MUL.MAT.C.8.B 


Wide multiply matrix signed complex byte big-endian 


W.MUL.MAT.C.8.L 


Wide multiply matrix signed complex byte little-endian 


W.MUL.MAT.C.16.B 


Wide multiply matrix signed complex doublet big-endian 


W.MUL.MAT.C.16.L 


Wide multiply matrix signed complex doublet little-endian 


W.MUL.MAT.M.8.B 


Wide multiply matrix mixed-signed byte big-endian 


W.MUL.MAT.M.8.L 


Wide multiply matrix mixed-signed byte little-endian 


W.MUL.MAT.M.16.B 


Wide multiply matrix mixed-signed doublet big-endian 


W.MUL.MAT.M.16.L 


Wide multiply matrix mixed-signed doublet little-endian 


W.MUL.MAT.M.32.B 


Wide multiply matrix mixed-signed quadlet big-endian 


W.MUL.MAT.M.32.L 


Wide multiply matrix mixed-signed quadlet little-endian 


W.MUL.MAT.P.8.B 


Wide multiply matrix polynomial byte big-endian 


W.MUL.MAT.P.8.L 


Wide multiply matrix polynomial byte little-endian 


W.MUL.MAT.P.16.B 


Wide multiply matrix polynomial doublet big-endian 


W.MUL.MAT.P.16.L 


Wide multiply matrix polynomial doublet little-endian 


W.MUL.MAT.P.32.B 


Wide multiply matrix polynomial quadlet big-endian 


W.MUL.MAT.P.32.L 


Wide multiply matrix polynomial quadlet little-endian 


W.MUL.MAT.U.8.B 


Wide multiply matrix unsigned byte big-endian 


W.MUL.MAT.U.8.L 


Wide multiply matrix unsigned byte little-endian 


W.MUL.MAT.U.16.B 


Wide multiply matrix unsigned doublet big-endian 


W.MUL.MAT.U.16.L 


Wide multiply matrix unsigned doublet little-endian 


W.MUL.MAT.U.32.B 


Wide multiply matrix unsigned quadlet big-endian 


W.MUL.MAT.U.32.L 


Wide multiply matrix unsigned quadlet little-endian 



Selection 



class 


op 


type 


size 


order 


multiply 


W. MUL. MAT 


NONE M U P 


8 16 32 


B 
L 






C 


8 16 


B 
L 



Format 
W.op.size.order rd=rc,rb 
rd=wopsizeorder(rc,rb) 
31 2423 



1817 



1211 



W.MINOR.order 



rd 



8 

sz-*- log(size) - 3 



I 



65 



21 0 



rc 



I rb | W.op | szl 



FIG. 14A 



r 



1430 



128 



m[rc](128*64/size) 
511 




































































































































1 


1 ^Jl 1 


/ 


r 0 



127 



rb(128) 



rd(128) o 
Wide multiply matrix 



FIG. 14B 
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Definition f 
def mul(size,h,vs,v,i,ws,w,j) as 
    mul ← ((vs & v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws & w_{size-1+j})^{h-size} || w_{size-1+j..j}) 
enddef 

def c ← PolyMultiply(size,a,b) as 
    p[0] ← 0^{2*size} 
    for k ← 0 to size-1 
        p[k+1] ← p[k] ^ a_{k} ? (0^{size-k} || b || 0^{k}) : 0^{2*size} 
    endfor 
    c ← p[size] 
enddef 

def WideMultiplyMatrix(major,op,gsize,rd,rc,rb) 
    d ← RegRead(rd, 128) 
    c ← RegRead(rc, 64) 
    b ← RegRead(rb, 128) 
    lgsize ← log(gsize) 
    if c_{lgsize-4..0} ≠ 0 then 
        raise AccessDisallowedByVirtualAddress 
    endif 

if C2..lgsize-3* 0 then 

wsize-«— (c and (0-c))|| 0 
t-+-c and (c-1) 

else 

wsize-*— 64 
t-*-a 

endif 

    lwsize ← log(wsize) 
    if t_{lwsize+6-lgsize..lwsize-3} ≠ 0 then 
        msize ← (t and (0-t)) || 0^{4} 
        VirtAddr ← t and (t-1) 
    else 
        msize ← 128*wsize/gsize 
        VirtAddr ← t 
    endif 

case major of 

W.MINOR.B: 

order^-B 
W.MINOR.L: 
order-*— L 

endcase 

FIG. 14D-1 



1480 



    case op of 
        W.MUL.MAT.U.8, W.MUL.MAT.U.16, W.MUL.MAT.U.32, 
        W.MUL.MAT.U.64: 
            ms ← bs ← 0 
        W.MUL.MAT.M.8, W.MUL.MAT.M.16, W.MUL.MAT.M.32, 
        W.MUL.MAT.M.64: 
            ms ← 0 
            bs ← 1 
        W.MUL.MAT.8, W.MUL.MAT.16, W.MUL.MAT.32, 
        W.MUL.MAT.64, W.MUL.MAT.C.8, W.MUL.MAT.C.16, 
        W.MUL.MAT.C.32, W.MUL.MAT.C.64: 
            ms ← bs ← 1 
        W.MUL.MAT.P.8, W.MUL.MAT.P.16, W.MUL.MAT.P.32, 
        W.MUL.MAT.P.64: 
    endcase 

    m ← LoadMemory(c,VirtAddr,msize,order) 
    h ← 2*gsize 
    for i ← 0 to wsize-gsize by gsize 
        q[0] ← 0^{2*gsize} 
        for j ← 0 to vsize-gsize by gsize 
case op of 

W.MUL.MAT.P.8, W.MUL.MAT.P.16, 
W.MUL.MAT.P.32, W.MUL.MAT.P.64: 
k i+wsize*j 8Jgsize 

q[j+gsize] qfj] A PolyMultiply(gsize,m k ^ sjze -i..k, 

bj+gsize-1..j) 

W.MUL.MAT.C.8, W.MUL.MAT.C.16, W.MUL.MAT.C.32, 

W.MUL.MAT.C.64: 

if (~i) & gsize = 0 then 

k-^i-G&gsize)+wsize*j 8 .. lgsize+1 
q(j+gsizeH— q[i] + mul(gsize,h,ms,m,k,bs,b,j) 

else 

k -«- i+gsize+wsize*j 8 ..|g S j Ze+1 
q(i+gsize]-*— q[i] = mul(gsize,h,ms,m,k,bs,b,j) 

endif 



FIG. 14D-2 
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W.MUL.MAT. 8, W.MUL.MAT.16, W.MUL.MAT 32 
W.MULMAT.64, W.MUL.MAT. M. 8, W.MUL.MAT M 16 
W.MUL.MAT. M. 32, W.MUL.MAT.M.64, W.MUL MAT U8 
W.MUL.MAT.U.16, W.MUL.MAT.U.32, W.MUL.MAT. U. 64 
q[i+gsize] q(i] + mul(gsize,h,ms,m,i+wsize* 

endfor k ^™- b *^ 

a 2*gsize-1+2*i..2*i^-q(vsize] 
endfor 

3l27..2*wsize^- 0 
RegWrite(rd, 128, a) 
enddef 



FIG. 14D-3 



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 



W.MUL.MAT.X.B 


Wide multiply matrix extract big-endian 


W.MUL.MAT.X.L 


Wide multiply matrix extract little-endian 



Selection 



class 


op 


order 


Multiply matrix extract 


W.MUL.MAT.X 


B L 


Format 








W. op. order ra=rc,rd,rb 








ra=wop(rc,rd,rb) 

31 2423 


1817 


1211 65 


0 


i W. op. order j 


rd | 


rc | rb 7 


ra | 


8 


6 


6 6 


6 




FIG. 15A 
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r \extr 


r 

ac/, 


0 


extr 

i 


act/, 


r \extr 


ac/, 


r \extr 


ac/, 


r 


\extrac(/ 

r 1 \ 
I I 


Xextract/ 


\extract/ 
I I 


r | 
1 1 



127 



rd(128) 



28 ra(128) o 

Wide multiply matrix extract doublets 



FIG. 15C 



rc(64*128/size) 




rd(128) 



128 ra(128) 0 

Wide multiply matrix extract complex doublets 



FIG. 15D 



Definition 1e .„ n 
def mul(size,h,vs,v,i,ws,w,j) as 
    mul ← ((vs & v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws & w_{size-1+j})^{h-size} || w_{size-1+j..j}) 
enddef 

def WideMultiplyMatrixExtract(op,ra,rb,rc,rd) 
    d ← RegRead(rd, 128) 
    c ← RegRead(rc, 64) 
    b ← RegRead(rb, 128) 
    case b_{8..0} of 
        0..255: 
            sgsize ← 128 
        256..383: 
            sgsize ← 64 
        384..447: 
            sgsize ← 32 
        448..479: 
            sgsize ← 16 
        480..495: 
            sgsize ← 8 
        496..503: 
            sgsize ← 4 
        504..507: 
            sgsize ← 2 
        508..511: 
            sgsize ← 1 
    endcase 
    l ← b_{11} 
    m ← b_{12} 
    n ← b_{13} 
    signed ← b_{14} 
    if c_{3..0} ≠ 0 then 
        wsize ← (c and (0-c)) || 0^{4} 
        t ← c and (c-1) 
    else 
        wsize ← 128 
        t ← c 
    endif 

if sgsize < 8 then 

gsize-«-8 
elseif sgsize > wsize/2 then 

gsize-«-wsize/2 

else 



FIG. 15E-1 
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gsize-#-sgsize 

endif 

    lgsize ← log(gsize) 
    lwsize ← log(wsize) 
    if t_{lwsize+6-n-lgsize..lwsize-3} ≠ 0 then 
        msize ← (t and (0-t)) || 0^{4} 
        VirtAddr ← t and (t-1) 
    else 
        msize ← 64*(2-n)*wsize/gsize 
        VirtAddr ← t 
    endif 
    vsize ← (1+n)*msize*gsize/wsize 
    mm ← LoadMemory(c,VirtAddr,msize,order) 
    lmsize ← log(msize) 
    if (VirtAddr_{lmsize-4..0} ≠ 0) then 
        raise AccessDisallowedByVirtualAddress 
    endif 

    case op of 
        W.MUL.MAT.X.B: 
            order ← B 
        W.MUL.MAT.X.L: 
            order ← L 
    endcase 
    ms ← signed 
    ds ← signed ^ m 
    as ← signed or m 
    spos ← (b_{8..0}) and (2*gsize-1) 
    dpos ← (0 || b_{23..16}) and (gsize-1) 
    r ← spos 
    sfsize ← (0 || b_{31..24}) and (gsize-1) 
    tfsize ← (sfsize = 0) or ((sfsize+dpos) > gsize) ? gsize-dpos : sfsize 
    fsize ← (tfsize + spos > h) ? h - spos : tfsize 
    if (b_{10..9} = Z) & ~signed then 
        rnd ← F 
    else 
        rnd ← b_{10..9} 
    endif 



FIG. 15E-2 



1580 

    for i ← 0 to wsize-gsize by gsize 
        q[0] ← 0^{2*gsize+7-lgsize} 
        for j ← 0 to vsize-gsize by gsize 
if n then 

if (-) & j & gsize = 0 then 

k-*- i-(j&gsize)+wsize*j 8Jgsize+1 
q[i+gsize]-*- q[i] + mul(gsize,h,ms,mm,k,ds,d,j) 

else 

k -«~ i+gsize+wsize*j 8 Jgsize+ i 

q[i+gsize]-*- q[i] - muKgsizeXms.mm.Ms.dj) 

endif 

else 

q[i+gsize]-«-q[i] = mul(gsize,h,ms,mm,i+j*wsize/gsize,ds,d l j) 

endif 
endfor 
p— q[128] 
case rnd of 

none, N: 

s^0 h -Mhp r ||pM 

s^O^llp^ 

F: 

s-*-0 h 

C: 

s^0h-r||ir 

endcase 

v^((ds&ph-i)||p) + (0||s) 

if (Vh..r+fsize= (as & v r + fs j 2e -i )h*1-r-fsizoj or not I then 

w — (as & v r+fsize .i)g^^fsize-dpos|| Vfsize Uf r || O^s 

else 

w-#-(s ? (v h ||-v9 size " d P° s " 1 ) : -J gsize-dpos ^ || Qdpos 

endif 

3size-1+i..i^~ W 
endfor 

a 127..wsize"*-0 
RegWrite(ra, 128, a) 
enddef 
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Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 



W.MUL.MAT.X.I.8.B 


Wide multiply matrix extract immediate signed byte big-endian 


W.MUL.MAT.X.I.8.L 


Wide multiply matrix extract immediate signed byte little-endian 


W.MUL.MAT.X.I.16.B 


Wide multiply matrix extract immediate signed doublet big-endian 


W.MUL.MAT.X.I.16.L 


Wide multiply matrix extract immediate signed doublet little-endian 


W.MUL.MAT.X.I.32.B 


Wide multiply matrix extract immediate signed quadlet big-endian 


W.MUL.MAT.X.I.32.L 


Wide multiply matrix extract immediate signed quadlet little-endian 


W.MUL.MAT.X.I.64.B 


Wide multiply matrix extract immediate signed octlets big-endian 


W.MUL.MAT.X.I.64.L 


Wide multiply matrix extract immediate signed octlets little-endian 


W.MUL.MAT.X.I.C.8.B 


Wide multiply matrix extract immediate complex bytes big-endian 


W.MUL.MAT.X.I.C.8.L 


Wide multiply matrix extract immediate complex bytes little-endian 


W.MUL.MAT.X.I.C.16.B 


Wide multiply matrix extract immediate complex doublets big-endian 


W.MUL.MAT.X.I.C.16.L 


Wide multiply matrix extract immediate complex doublets little-endian 


W.MUL.MAT.X.I.C.32.B 


Wide multiply matrix extract immediate complex quadlets big-endian 


W.MUL.MAT.X.I.C.32.L 


Wide multiply matrix extract immediate complex quadlets little-endian 



Selection 



class 


op 


type 


size 


order 


wide multiply 
extract immediate 


W.MUL.MAT.X.I 


NONE 


8 16 32 64 


L B 


C 


8 16 32 


L B 



Format 

W.op.tsize. order rd=rc,rb, i 
rd=woptsizeorder(rc,rb,i) 
31 24 23 



18 17 



12 11 

zn 



6 5 4 32 0 



| W.op. order | 



rd 



rc 



rb 



sz 



sh 



8 



1 2 



sz ← log(size) - 3 
assert size+3 > i > size-4 
sh ← i - size 
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1023 m[rc](128*128/size) 



^extract/ , ^extract / ^extrac^ ] 



Xextract/ 



i r 



Extract/ 



\extrac^ / 



extract/ , r 



127 



rd(128) 



\extrac/ 



128 rd(128) o 

Wide multiply matrix extract immediate doublets 



FIG. 16B 



1660 




127 



rb(128) 



\extrac^ / 
I 



T 



\extract/ , r \extract/ , .N pxtracj /. , \extract /, , 



Xextract/ 



Xextract/ 



Xextract/ 



128 rd(1 28) 0 

Wide multiply matrix extract immediate complex doublets 
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Definition 

def mul(size,h,vs,v,i,ws,w,j) as 
    mul ← ((vs & v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws & w_{size-1+j})^{h-size} || w_{size-1+j..j}) 
enddef 

def WideMultiplyMatrixExtractImmediate(op,type,gsize,rd,rc,rb,sh) 
    c ← RegRead(rc, 64) 
    b ← RegRead(rb, 128) 
    lgsize ← log(gsize) 
case type of 
NONE: 

if C| gs ize-4..0 *• 0 then 

raise AccessDisailowedBy VirtualAddress 
endif 

if c 3J g Si2 e-3 * 0 then 

wsize-*- (c and (0-c))||0 4 
t-*-c and (c-1) 

else 

wsize-*- 128 
t-*-c 

endif 

lwsize-*-log(wsize) 

if t|wsize+6-lg$ize..lwsize-3 * 0 then 

msize -«-(t and (0-t))||0 4 

VirtAddr-*-t and (t-1) 

else 

msize 1 28*wsize/gsize 
VirtAddr-«-t 

C: 

if C|gsize-4..o * 0 then 

raise AccessDisallowedByVirtualAddress 

endif 

if C3..|gsize-3 * ^ ^ en 

wsize -*-(c and (0-c)) || 0 4 
t-*-c and (c-1) 

else 

wsize -*-128 
t-*-c 

endif 

Iwsize^-log(wsize) 

if t|wsize+5-lgsize..lwsize-3 * 0 then 

msize-#-(t and (0-t))|| 0 4 



FIG. 16D-1 



VirtAddr-«- t and (t-1) ^^—1680 

else 

msize -*-64*wsize/gsize 
VirtAddr-^t 

endif 

vsize 2*msize*gsize/wsize 

endcase 
    case op of 
        W.MUL.MAT.X.I.B: 
            order ← B 
        W.MUL.MAT.X.I.L: 
            order ← L 
    endcase 
    as ← ms ← bs ← 1 
    m ← LoadMemory(c,VirtAddr,msize,order) 
    h ← (2*gsize) + 7 - lgsize - (ms and bs) 
    r ← gsize + (sh_{2}^{5} || sh) 
    for i ← 0 to wsize-gsize by gsize 
        q[0] ← 0^{2*gsize+7-lgsize} 
        for j ← 0 to vsize-gsize by gsize 
case type of 
NONE: 

q[j+gsize] -*-q[i] + mul(gsize,h,ms,m,i+wsize # 
J8..lgsize,bs,b,j ) 

C: 

if (H) & j & gsize = 0 then 

k ^i-G&gsize)+wsize Vigsize+1 
q[j + gsize]-#- q[i] + muKgsize.h.ms.m.k.bs.bJ) 

else 

k^-i+gsize+wsize*j 8Jgsi2e+1 
q[j+gsizel^q[j] - muKgsize.h.ms.m.k.bs.bJ) 
endif 

endcase 
endfor 

p-«-q[vsize] 

0h-r|| -p r || pr-1 
v^-((as&p M )||p)-H(0||s) 
if (v h . .r+gsize = (as & v r+ g S j Ze _i )h+1-r-gsize then 

3gsize-1+i..i Vgsize-1+r..r 

else 

a gs ize-i+i..i-«- as ? (v h ||-v9 size " 1 ) : 1 gsize 

endif 
endfor 

3l27..wsize 0 
RegWrite(rd, 128, a) 
enddef FIG. 16D-2 



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 



W.MUL.MAT.C.F.16.B 


Wide multiply matrix complex floating-point half big-endian 


W.MUL.MAT.C.F.16.L 


Wide multiply matrix complex floating-point half little-endian 


W.MUL.MAT.C.F.32.B 


Wide multiply matrix complex floating-point single big-endian 


W.MUL.MAT.C.F.32.L 


Wide multiply matrix complex floating-point single little-endian 


W.MUL.MAT.F.16.B 


Wide multiply matrix floating-point half big-endian 


W.MUL.MAT.F.16.L 


Wide multiply matrix floating-point half little-endian 


W.MUL.MAT.F.32.B 


Wide multiply matrix floating-point single big-endian 


W.MUL.MAT.F.32.L 


Wide multiply matrix floating-point single little-endian 


W.MUL.MAT.F.64.B 


Wide multiply matrix floating-point double big-endian 


W.MUL.MAT.F.64.L 


Wide multiply matrix floating-point double little-endian 



Selection 



class 


op 


type 


prec 


order 


wide multiply matrix 


W.MUL.MAT 


F 


16 32 64 


L B 


C.F 


16 32 


L B 



Format 

W. op. prec. order rd=rc,rb 
rd=wopprecorder(rc,rb) 

31 24 23 18 17 12 11 6 5 2 1 0 

| W.MINOR.order | rd | rc | rb | W.op | pr | 
8 6 6 6 4 2 

pr ← log(prec) - 3 



FIG. 17 A 



023 m[rc](128*128/size) 



I 



I 



127 



rb(128) 



128 rd(128) 0 

Wide multiply matrix floating-point half 



FIG. 17 B 



511 rc(64*128/size) 




Wide multiply matrix complex floating-point half 



FIG. 17 C 



r. — 1780 

Definition ^ 

def mul(size,v,i,w,j) as 
    mul ← fmul(F(size, v_{size-1+i..i}), F(size, w_{size-1+j..j})) 
enddef 



def WideMultiplyMatrixFloatingPoint(major,op,gsize,rd,rc,rb) 
    c ← RegRead(rc, 64) 
    b ← RegRead(rb, 128) 
    lgsize ← log(gsize) 
switch op of 

W.MUL.MAT.F.16, W.MUL.MAT.F.32, W.MUL.MAT.F.64: 
if c lgsize-4..0 * 0 then 

raise AccessDisallowedByVirtualAddress 
endif 

if C3..ig S j Ze -3 * 0 then 

wsize -#-(c and (0-c))|| 0 4 
t-»-c and (c-1) 

else 

wsize -«-128 
t-«-c 

endif 

lwsize-+-log(wsize) 

if t lwsize+6-lgsize..lwsize-3 * 0 then 

msize-*-(t and (0-t))||0 4 

VirtAddr-*- 1 and (t-1) 

else 

msize -«-128*wsize/gsize 
VirtAddr-«-t 

endif 

vsize-*— msize*gsize/wsize 
W.MUL.MAT.C.F.16, W.MUL.MAT.C.F.32, W.MUL.MAT.C.F.64: 
'f c lgsize-4..0 * 0 then 

raise AccessDisallowedByVirtualAddress 

endif 

ifc 3..lgsize-3''0t h en . 
wsize-«-(c and (0-c))|| 0 4 
t-*-c and (c-1) 

else 

wsize -^128 
t-«-c 

endif 

lwsize-#-log{wsize) 

if flwsize-t-5-lgsize..lwsize-3 * 0 then 



FIG. 17D-1 



— 1780 

msize-*- (t and (0-t))|| 0 4 
VirtAddr-«-t and (t-1) 

else 

msize-*-64*wsize/gsize 
VirtAddr-»-t 

endif 

vsize -*-2*msize*gsize/wsize 

endcase 
    case major of 
        W.MINOR.B: 
            order ← B 
        W.MINOR.L: 
            order ← L 
    endcase 
    m ← LoadMemory(c,VirtAddr,msize,order) 
    for i ← 0 to wsize-gsize by gsize 
        q[0].t ← NULL 
        for j ← 0 to vsize-gsize by gsize 
case op of 

W.MULMAT.F.16, W.MUL.MAT.F.32, W.MUL.MAT.F.64: 
q[j+gsize]-*-faddq[j], mul(gsize,m,i+wsize* 

J8..lgsize+1 - D 'j)) 
W.MUL.MAT.C.F.16, W.MUL.MAT.C.F.32, 

W.MUL.MAT.C.F.64: 

if (~i) & j & gsize = 0 then 

k i-(j&gsize)+wsize*j 8 Jgsize ^i 

qfj+gsize] faqq[j], mul(gsize,m,k,b,j)) 

else 

k-*- i+gsize+wsize*j 8 ..igsize+i 
qfj+gsize] -»-fsubq[j], mul(gsize,m,k,b,j)) 
endif 

endcase 
endfor 

agsize-1+i..i-*- q[vsize] 
endfor 

3l27..wsize"*~ 0 

RegWrite(rd, 128, a) 
enddef 
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Exceptions 



Floating-point arithmetic 
Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 



FIG. 17E 



1810 



Operation codes 



W.MUL.MAT.G.8.B 


Wide multiply matrix Galois bytes big-endian 


W.MUL.MAT.G.8.L 


Wide multiply matrix Galois bytes little-endian 



Selection 



class 


op 


size 


order 


Multiply matrix Galois 


W.MUL.MAT.G 


8 


B L 


Format 










W.op.order ra=rc,rd,rb 










ra=woporder(rc,rd,rb) 










31 24 23 18 17 


12 11 


6 


5 0 


W.op.order 


rd rc 


rb 


I 


ra | 


8 


6 6 


6 




6 



FIG. 18A 
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2047 



m|rc](128*128/srze) 
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F 
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r 


0 

> 


iod 

y 




r \moc 


lule/^ 


Amodule/, 


Amod 


ule/, 


r \rnod 


ule/, 


r \module/ i 


r \module/^ 


f \mo< 




\module/ 

f V } 

r J — i— 1 


\module/ \mod 

T " 1 — 1 — 1 — 1 — 1 


ule/ 

r i 


Xmodule/ 

- .K- 


Xmodule/ 


\moc 

r i 


[ule/ 


\moc 


lule/ 


\mod 

- . 


ule, 



127 



rd(128) 



128 



Wide multiply matrix Galois byte 

FIG. 18B 



Definition 

def c ← PolyMultiply(size,a,b) as 
    p[0] ← 0^{2*size} 
    for k ← 0 to size-1 
        p[k+1] ← p[k] ^ a_{k} ? (0^{size-k} || b || 0^{k}) : 0^{2*size} 
    endfor 
    c ← p[size] 
enddef 

def c-*-PolyResidue(size,a,b) as 
P(0] a 

for k-^— size-1 to 0 by-1 

p[k-1]— p[k] * p[0] si2e+k ?(0 size - k |l b|| 0 k ) 
endfor 

c-#-p[size] S i Ze -i..o 
enddef 

def WideMultiplyMatrixGalois(op,gsize,rd,rc,rb,ra) 
    d ← RegRead(rd, 128) 
    c ← RegRead(rc, 64) 
    b ← RegRead(rb, 128) 
    lgsize ← log(gsize) 
    if c_{lgsize-4..0} ≠ 0 then 
        raise AccessDisallowedByVirtualAddress 
    endif 

if c 3 ..igsize-3 * 0 then 

wsize (c and (0-c)) || 0 4 
t-*-c and (c-1) 

else 

wsize -#-128 
t-«-c 

endif 

Iwsize^-log(wsize) 

if t|wsize+6-lgsize..lwsize-3* 0 then 

msize-«-(t and (0-t)) || 0 4 

VirtAddr-«-t and (t-1) 

else 

msize-*-128*wsize/gsize 
VirtAddr-*— t 

endif 

    case op of 
        W.MUL.MAT.G.8.B: 
            order ← B 
        W.MUL.MAT.G.8.L: 
            order ← L 

    endcase 

FIG. 18C-1 



1860 



    m ← LoadMemory(c,VirtAddr,msize,order) 
    for i ← 0 to wsize-gsize by gsize 
        q[0] ← 0^{2*gsize} 
        for j ← 0 to vsize-gsize by gsize 
            k ← i + wsize*j_{8..lgsize} 
            q[j+gsize] ← q[j] ^ PolyMultiply(gsize, m_{k+gsize-1..k}, d_{j+gsize-1..j}) 
        endfor 
        a_{gsize-1+i..i} ← PolyResidue(gsize, q[vsize], b_{gsize-1..0}) 
    endfor 

a 127..wsize-*- 0 

RegWrite(ra,128, a) 
enddef 



FIG. 18C-2 



Exceptions 



Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 
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Operation codes 



E.MUL.ADD.X 


Ensemble multiply add extract 


E.CON.X 


Ensemble convolve extract 1 


Format 




E.op rd@rc,rb,ra 




rd=gop(rd,rc,rb,ra) 




31 24 


23 18 17 12 11 6 5 0 


I E.op | 


,(J I rc I rb | ra I 


8 


6 6 6 6 



FIG. 19A 
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Figures 19B and 20B has blank fields: should be. 



| dpos | x | s | n |m| l| rnd 



fsize 



gssp 



FIG. 19B 



1930 



127 



rc(128) 



127 



' \extract/ i \extrac{/ ' \extrac{/ \extra<V 



I r 



\extract/ , , Vxtract /, , \exfract /, \extf*a<V , r 



rb(128) 



± 0 



i i i i ~r~r 



128 rd(128) 0 

Ensemble multiply add extract doublets 



FIG. 19C 



r 



1945 



127 





rc(128 



\extract/ , F \extra<V , 



1 ^ Xextract /' \extrac| / ] S pxtrac^ \extract / 



I 



I 




I 




t \extracy / f \extract/ , r 



127 



rb(128) 



I 



111 



128 rd(128) 0 

Ensemble complex multiply add extract doublets 

The ensemble-multiply-add-extract instructions (E.MUL.ADD.X), when 
the x bit is set, multiply the low-order 64 bits of each of the rc and rb 
registers and produce extended (double-size) results. 



FIG. 19D 



1960 



255 



rc II rd (256) 




Xextrac^ / 



\extract/ i Ny extrac^ r \extracl / r \extrac^ / i f 



\extrac^ / 



J 



\extracy / 



I 



\extrac^ 



128 



rd(128) 0 

Ensemble convolve extract doublets 



(128) 



FIG. 19E 



1975 



255 



rc II rd (256) 




\extract/ , , \extract/ , f \extract/ i r \extract/ . f 



\extract/ \extracy \extract / \extrac^ 



III 



128 



I I I I 



rd(128) 0 
Ensemble convolve extract complex doublets 



(128) 



FIG. 19F 



Definition ^_1990 
def mul(size,h,vs,v,i,ws,w,j) as 
    mul ← ((vs & v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws & w_{size-1+j})^{h-size} || w_{size-1+j..j}) 
enddef 



def EnsembleExtractInplace(op,ra,rb,rc,rd) as 
    d ← RegRead(rd, 128) 
    c ← RegRead(rc, 128) 
    b ← RegRead(rb, 128) 
    case b_{8..0} of 
        0..255: 
            sgsize ← 128 
        256..383: 
            sgsize ← 64 
        384..447: 
            sgsize ← 32 
        448..479: 
            sgsize ← 16 
        480..495: 
            sgsize ← 8 
        496..503: 
            sgsize ← 4 
        504..507: 
            sgsize ← 2 
        508..511: 
            sgsize ← 1 
    endcase 
    l ← a_{11} 
    m ← a_{12} 
    n ← a_{13} 
    signed ← a_{14} 
    x ← a_{15} 

    case op of 
        E.CON.X: 
            if (sgsize < 8) then 
                gsize ← 8 
            elseif (sgsize*(n+1)*(x+1) > 128) then 
                gsize ← 128/(n+1)/(x+1) 
            else 
                gsize ← sgsize 
            endif 
            lgsize ← log(gsize) 
            wsize ← 128/(x+1) 



FIG. 19G-1 



vsize -*-128 ^-1990 



            ds ← cs ← signed 
            bs ← signed ^ m 
            zs ← signed or m or n 
            zsize ← gsize*(x+1) 
            h ← (2*gsize) + log(vsize) - lgsize 
            spos ← (a_{8..0}) and (2*gsize-1) 



        E.MUL.ADD.X: 
            if (sgsize < 8) then 
                gsize ← 8 
            elseif (sgsize*(n+1)*(x+1) > 128) then 
                gsize ← 128/(n+1)/(x+1) 
            else 
                gsize ← sgsize 
            endif 
            ds ← signed 
            cs ← signed ^ m 
            zs ← signed or m or n 
            zsize ← gsize*(x+1) 
            h ← (2*gsize) + n 
            spos ← (a_{8..0}) and (2*gsize-1) 
    endcase 

    dpos ← (0 || a_{23..16}) and (zsize-1) 
    r ← spos 
    sfsize ← (0 || a_{31..24}) and (zsize-1) 
    tfsize ← (sfsize = 0) or ((sfsize+dpos) > zsize) ? zsize-dpos : sfsize 
    fsize ← (tfsize + spos > h) ? h - spos : tfsize 
    if (b_{10..9} = Z) and not as then 
        rnd ← F 
    else 
        rnd ← b_{10..9} 
    endif 



FIG. 19G-2 



* 1990 

for k 0 to wsize-zsize by zsize 
i-«-k*gsize/zsize 
case op of 
E.CON.X: 
q[0]— 0 

for \~+- 0 to vsize-gsize by gsize 
if n then 

if(~) & j & gsize = 0 then 

q[j+gsize]-+- q[j] + mul(gsize,h 1 ms 1 m 1 i+ 
128-j,bs,b,j) 

else 

q[j+gsize]-*-q(j] - mul(gsize,h,ms,i+ 
128-j+2*gsize,bs,b,j) 
endif 

else 

qfj+gsize] -*-q[j] + mul(gsize,h,ms,m,i+ 
128-j,bs,b,j) 

endif 
endfor 

p -#-q[vsize) 
E.MUL.ADD.X: 

di -*-((ds and dk+zize-1 )h-zsize-r|| (d k+2S j 2 e-1..k )|| 0 r ) 
if n then 

if ( i and gsize) = 0 then 

p muKgsize.h.ds.d.i.cs.c.i)- 
mul(gsize,h,ds,d,i+gsize,cs,c,i+gsize)+di 

else 

p^muKgsize.h.ds.d.i.cs.c.i+gsizeJ-HTiuKgsize.h.ds.d.i.cs.ci-^sizeJ+di 

endif 

else 

p-*- mul(gsize,h,ds,d,i,cs,c,i) + di 

endif 

endcase 



FIG. 19G-3 



case rnd of 

N: 

^ O h - r ||-p r ||pr-l 
R s — OMI pr , 
s-»-0 h 

C: 

s O h " r lh r 

endcase 

v 7-(( zs &Ph.i)HP)*(0||s) 

cir h n !;^!" (ZS & V ^size-i)b + i-f-fsize) or not (I and (op = 
EXTRACT)) then 

else W ^ (ZS & Vr+,Si2e - ,)ZSiZe " <SiZe " dP0S ll V fsize-1 + r..rll 0 d P°* 
W-»-(zs ? (Vh||~v£ size - d P°s-1) • izsize-dposy|| 0 dpos 



endif 

Zzsize-1_k..k-*- w 
endfor 

RegWrite(rd, 128, z) 
enddef 



FIG. 19G-4 
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Operation codes 
E.MUL.X 



Ensemble multiply extract 



E.EXTRACT 



Ensemble extract 



E.SCAL.ADD.X 



Ensemble scale add extract 



Format 

E.op ra=rd,rc,rb 
ra=eop(rd,rc,rb) 

31 24 23 



18 17 



12 11 



6 5 



:£P_ 



8 



I 



rd 



rc 



rb 



ra 



FIG. 20A 




FIG. 20B 



r 



2030 




\extract, 



, , \extrac{/ , 



\extract / 



rc(128) 



\extrac^ / 



I I I I 



T 



, \extracy / 1 p \ pxtrac</ , , 



w extracy / 



\extracy / 



r 



III 



128 ra(128) 0 

Ensemble complex multiply extract doublets 

The ensemble-multiply-extract instructions (E.MUL.X), when 

the x bit is set, multiply the low-order 64 bits of each of the rc and rb 

registers and produce extended (double-size) results. 



FIG. 20D 



2020 



127 



rd(128) 



127 




t I _ , i j u 



Vextract/ \extr'act/ 



128 



ra(128) 



rc(128) 




FIG. 20C 



r 



2040 



127 



127 



rd(128) 
rc(128) 



\extract/ 



Vxtract/ 



\ex tract/ 



■ 95 
- 80 

0 rb(128) 
"79 

. 64 



1 28 ra(128) 0 

Ensemble scale add extract doublets 



FIG. 20E 



r 



2050 




\extrac^ / 



\extract/ , r \extracy /, r \extrac^/ , Xextracj/ , r 



Xextract/ 



\extrac^ / 



i i i i 



\extrac^ / 



128 



ra(128) 



Ensemble complex scale add extract doublets 

The ensemble-scale-add-extract instructions (E.SCAL.ADD.X), when the x bit 
is set, multiply the low-order 64 bits of each of the rd and rc registers by the 
rb register fields and produce extended (double-size) results. 
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t fsize spoc 





1 
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~4 


2* gsize /\ 








rd||rc 



s 


ab 


■ — ► 

0 




fsize 


dpos 



Ensemble extract 

FIG. 20G 



St 



gsize 



rd 



5: 



fsize 



Ensemble merge extract 
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St 




rd H 






gsize 






/ \ \ 


s 


a 


o 1 



fsize ^ ^ dpos 



Ensemble expand extract 

FIG. 20I 



Definition 2090 
def muKsize.h.vs.v.i.ws.w.j) as ^^-^usu 

enddef"^ ((vs&v -«-^*) h - size ll * ((ws&w S ize- H )h-size|| Wsize . H . , 



def EnsembleExtract(op,ra,rb,rc,rd) as
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    case b_{8..0} of
        0..255:
            sgsize <- 128
        256..383:
            sgsize <- 64
        384..447:
            sgsize <- 32
        448..479:
            sgsize <- 16
        480..495:
            sgsize <- 8
        496..503:
            sgsize <- 4
        504..507:
            sgsize <- 2
        508..511:
            sgsize <- 1
    endcase
    l <- b_{11}
    m <- b_{12}
    n <- b_{13}
    signed <- b_{14}
    x <- b_{15}

case op of 

E. EXTRACT: 

gsize sgsize*2(2-(m or x)) 

zsize-*— sgsize 

h-<- gsize 

as-*- signed 

spos-*-(be..o) and (gsize-1) 



FIG. 20J-1 




if (sgsize < 8) then 

gsize-*-8 
elseif (sgsize*(n+1) > 32) then 

gsize-*- 32/(n+1) 

else 

gsize-*- sgsize 
endif 

ds-«- cs-*- signed 
bs-*- signed A m 
as-*- signed or m or n 
zsize -*-gsize*(x+1) 
h-*-(2*gsize) + 1 + n 
spos -*-(bs o) and (2*gsize-1) 
E.MUL.X: 

if (sgsize < 8) then 

gsize-*- 8 
elseif (sgsize*(n+1)*(x+1) > 128) then 

gsize-*-128/(n+1)/(x+1) 

else 

gsize-*- sgsize 

endif 

ds -^-signed 

cs-^- signed A m 

as signed or m or n 

zsize -*-gsize*(x+1) 

h-*-(2*gsize) + n 

spos-«-(b8..G) and (2*gsize-1) 

endcase 

dpos-+-(0|| b 2 3..i6) and (zsize-1) 
r -*-spos 

sfsize -*-(0|| b3i..24) and (zsize-1) 

tfsize -*-(sf size =0) or ((sfsize-nJpos) > zsize) ? zsize-dpos : sfsize 
fsize (tfsize + spos > h) ? h - spos : tfsize 
if (bio 9=Z) and not as then 
rnd-^-F 

else 

rnd-*- b 

endif 



FIG. 20J-2 



for j-*-Q to 128-zsize by zsize ^-2090 
i-«- j'gsize/zsize 
case op of 

E. EXTRACT: 
if m or x then 

p-*- dgsize-H-1..i 

else 

p-#- (d|| C)gsize+j-1..i 
endif 
E.MULX: 
if n then 

if (i and gsize) = 0 then 

p-*-mul(gsize,h f ds,d,i,cs,c,i)- 
muKgsize.h.ds.dj+gsize.cs.c.i+gsize) 

else 

mul(gsize ( h,ds,d,i J cs l c ( i+gsize)+mul(gsize,h,ds,d,i,cs > c,i+gsize) 

endif 

else 

p muKgsize^.ds.dJ.cs.c.i) 
endif 
E.SCALADD.X: 
if n then 

if (i and gsize) = 0 then 

p mul(gsize,h t ds,d,i,bs, b,64+2 # gsize) 
+ mul(gsize f h,cs,c,i,bs,b,64) 
• mul(gsize,h,ds ( d l i+gsize l bs,b,64^3 # gsize) 
- mul(gsize t h t cs l c t i+gsize,bs f b t 64+gsize) 

else 

p mul(gstze l h,ds,d,i,bs > b t 64+3*gsize) 
v + mul(gsize Acs,c,i,bs t b,64+gsize) 

+ mul(gsize,h,ds t d^size,bs,b f 64+2*g$ize) 
+ mul(gsize,h,cs l c J i+gsize,bs,b f 64) 

endif 

else 

p+ mul(gsize t h,ds,d t i,bs,b l 64+gsize) + mul(gsize 
,h,cs,c,i,bs,b,64) 

endif 

endcase 
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C3Se N rnd0f ^-2090 
S — 0^||. Pr || pr-1 

l s -o h ' r k, 

s-*-0 h 

C: 

s 0 hr |l 1 r 

endcase 

y -^-((as & p h -i)llp) + (Oils) 

'f (Vh..Msize= OS & Vr.fsize-O^I-r-fsize) or nQt () and , = 
E. EXTRACT)) then 

else W ^ & Vr+fsize=l)ZSiZe * fSiZe " dP0S ll v fsize-1 + r..rll 0 d P°* 

W -*-(s ? (vh|| ~v£ size - d P<>s-1) • ^size-dposjn Qdpos 
endif " 

if m and (op = E. EXTRACT) then 

Zzsize-H..j ^- Casize-1 + j..dpos+fsize*j||Wd P os+fsize-l..dpos|| 
else dpos - 1+j - j 

z Z size-1+j..j-#-w 
endif 
endfor 

RegWrite(ra, 128, z) 
enddef 
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\ 
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Typical dynamic-linked, inter 
caller: 



-gateway calling sequence: 



caller: A.ADDI
S.I.64.A
S.I.64.A

L.I.64.A
L.I.64.A
B.GATE
L.I.64.A

...(code using dp)
L.I.64.A

A.ADDI
B

callee (non-leaf): 

callee: L.I.64.A
S.I.64.A
L.I.64.A
S.I.64.A
S.I.64.A
...(using dp)
L.I.64.A

...(code using dp)
L.I.64.A
L.I.64.A

B.DOWN

callee (leaf, no stack):

callee: ...(using dp) 
B.DOWN 



sp@-size 

Ip.sp.off 

dp.sp.off 

lp=dp,off 
dp=dp,off 

dp.sp.off 

lp=sp,off 

sp=size 

IP 



dp=dp,off 

sp.dp.off 

sp=dp,off 

Ip.sp.off 

dp.sp.off 

dp.sp.off 

lp=sp,off 
sp=sp,off 
lp 



// allocate caller stack frame 



// load lp 
// load dp 



// restore original lp register 
// deallocate caller stack frame 
// return 



// load dp with data pointer 
// new stack pointer 



// restore original lp register 
// restore original sp register 



lp 



FIG. 21 B 
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Operation codes 


B.GATE 


Branch gateway | 


Equivalencies 


B.GATE 


B.GATE 0 | 


Format 




B.GATE rb 




bgate(rb) 




31 24 


23 18 17 12 11 6 5 0 


I B. MINOR 


u I 1 | rb | B.GATE I 


8 


6 6 6 6 



FIG. 21 C 



rc=1 




data 



Branch gateway 



FIG. 21 D 
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Definition 

def BranchGateway(rd,rc,rb) as
    c <- RegRead(rc, 64)
    b <- RegRead(rb, 64)
    if (rd ≠ 0) or (rc ≠ 1) then
        raise ReservedInstruction
    endif
    if c_{2..0} ≠ 0 then
        raise AccessDisallowedByVirtualAddress
    endif
    d <- ProgramCounter_{63..2}+1 || PrivilegeLevel
    if PrivilegeLevel < b_{1..0} then
        m <- LoadMemoryG(c,c,64,L)
        if b ≠ m then
            raise GatewayDisallowed
        endif
        PrivilegeLevel <- b_{1..0}
    endif
    ProgramCounter <- b_{63..2} || 0^{2}
    RegWrite(rd, 64, d)
    raise TakenBranch
enddef



FIG. 21E 



Exceptions 



Reserved Instruction 
Gateway disallowed 
Access disallowed by virtual address 
Access disallowed by tag 
Access disallowed by global TB 
Access disallowed by local TB 
Access detail required by tag 
Access detail required by local TB 
Access detail required by global TB 
Local TB miss 
Global TB miss 



FIG. 21 F 
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Operation codes 



E.SCAL.ADD.F.32



E.SCAL.ADD.F.64



Ensemble scale add floating-point half



Ensemble scale add floating-point single



Ensemble scale add floating-point double 



Selection 




Format 

E.op.prec ra=rd,rc,rb 
ra=eopprec(rd,rc,rb) 

31 24 23 

1 E.op.prec | 

8 



18 17 



12 11 



6 5 



rd 



re 



rb 



ra 



FIG. 22A 



Definition 



def EnsembleFloatingPointTernary(op,prec,rd,rc,rb,ra) as
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    for i <- 0 to 128-prec by prec
        di <- F(prec, d_{i+prec-1..i})
        ci <- F(prec, c_{i+prec-1..i})
        ai <- fadd(fmul(di, F(prec, b_{prec-1..0})), fmul(ci, F(prec, b_{2*prec-1..prec})))
        a_{i+prec-1..i} <- PackF(prec, ai, none)
    endfor
    RegWrite(ra, 128, a)
enddef



FIG. 22B 



Operation codes 




Selection 



operation 


function (binary) 


function (decimal) 


d 


11110000 


240 


c 


11001100 


204 


b _ 


10101010 


170


d&c&b 


10000000 


128 


(d&c)|b 


11101010 


234 


d|c|b 


11111110 


254 


d?c:b 


11001010 


202 


d^c^b


10010110 


150 


~d^c^b


01101001 


105 


0 


00000000 


0 



Format 

G. BOOLEAN rd@trc,trb,f 
rd=gbooleani(rd,rc,rb,f) 

21 252423 18 i 7 12 11 6 ' 

I G.BOOLEAN || h | rd | rc I rh [ 

7 1 ~ r — i 



FIG. 23A 



if f6=f5 then
    if f2=f1 then
        if f2 then
            rc <- max(trc,trb)
            rb <- min(trc,trb)
        else
            rc <- min(trc,trb)
            rb <- max(trc,trb)
        endif
        ih <- 0
        il <- 0 || f6 || f7 || f4 || f3 || f0
    else
        if f2 then
            rc <- trb
            rb <- trc
        else
            rc <- trc
            rb <- trb
        endif
        ih <- 0
        il <- 1 || f6 || f7 || f4 || f3 || f0
    endif
else
    ih <- 1
    if f6 then
        rc <- trb
        rb <- trc
        il <- f1 || f2 || f7 || f4 || f3 || f0
    else
        rc <- trc
        rb <- trb
        il <- f2 || f1 || f7 || f4 || f3 || f0
    endif
endif



F/G. 23B 



Definition 

def GroupBoolean(ih,rd,rc,rb,il)
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    if ih=0 then
        if il_{5}=0 then
            f <- il_{3} || il_{4} || il_{4} || il_{2} || il_{1} || (rc>rb)^{2} || il_{0}
        else
            f <- il_{3} || il_{4} || il_{4} || il_{2} || il_{1} || 0 || 1 || il_{0}
        endif
    else
        f <- il_{3} || 0 || 1 || il_{2} || il_{1} || il_{5} || il_{4} || il_{0}
    endif
    for i <- 0 to 127 by size
        a_{i} <- f_{(d_{i} || c_{i} || b_{i})}
    endfor
    RegWrite(rd, 128, a)
enddef



FIG. 23C 
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Operation codes 
B.HINT | Branch Hint 

Format 

B.HINT badd.count.rd 
bhint(badd.count,rd) 

31 24 23 1817 1211 65 

| B.MINOR | rd | count | simm | B.HINT |

8 6 6 6 6 

simm <— badd-pc-4 



FIG. 24A 
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Definition 

def BranchHint(rd,count,simm) as
    d <- RegRead(rd, 64)
    if (d_{1..0}) ≠ 0 then
        raise AccessDisallowedByVirtualAddress
    endif
    FetchHint(ProgramCounter + 4 + (0 || simm || 0^{2}), d_{63..2} || 0^{2}, count)
enddef



FIG. 24B 



Exceptions 

Access disallowed by virtual address 



FIG. 24C 
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Operation codes 



E.SINK.F.16 


Ensemble convert floating-point 


doublets from half nearest default 


E.SINK.F.16.C


Ensemble convert floatinq-point 


doublets from half ceilinq 


E.SINK.F.16.C.D 


Ensemble convert floatinq-point 


doublets from half ceiling default 


E.SINK.F.16.F 


Ensemble convert floatinq-point 


doublets from half floor 


E.SINK.F.16.F.D 


Ensemble convert floatinq-point 


doublets from half floor default 


E.SINK.F.16.N 


Ensemble convert floatinq-point 


doublets from half nearest 


E.SINK.F.16.X 


Ensemble convert floatinq-point 


doublets from half exact 


E.SINK.F.16.Z 


Ensemble convert floatinq-point 


doublets from half zero 


E.SINK.F.16.Z.D 


Ensemble convert floating-point 


doublets from half zero default 


E.SINK.F.32 


Ensemble convert floatinq-point 


quadlets from sinqle nearest default 


E.SINK.F.32.C 


Ensemble convert floatinq-point 


quadlets from single ceiling 


E.SINK.F.32.C.D 


Ensemble convert floatinq-point 


quadlets from sinqle ceilina default 


E.SINK.F.32.F 


Ensemble convert floating-point 


quadlets from single floor 


E.SINK.F.32.F.D 


Ensemble convert floatinq-point 


quadlets from sinqle floor default 


E.SINK.F.32.N 


Ensemble convert floating-point 


quadlets from single nearest 


E.SINK.F.32.X 


Ensemble convert floating-point 


quadlets from single exact 


E.SINK.F.32.Z 


Ensemble convert floatinq-point 


quadlets from sinqle zero 


E.SINK.F.32.Z.D 


Ensemble convert floatinq-point 


quadlets from sinale zero default 


E.SINK.F.64 


Ensemble convert floating-point 


octlets from double nearest default 


E.SINK.F.64.C 


Ensemble convert floating-point 


octlets from double ceiling 


E.SINK.F.64.C.D 


Ensemble convert floatinq-point 


octlets from double ceilina default 


E.SINK.F.64.F 


Ensemble convert floating-point 


octlets from double floor 


E.SINK.F.64.F.D 


Ensemble convert floatinq-point 


octlets from double floor default 


E.SINK.F.64.N 


Ensemble convert floatinq-point 


octlets from double nearest 


E.SINK.F.64.X


Ensemble convert floating-point


octlets from double exact


E.SINK.F.64.Z 


Ensemble convert floatinq-point 


octlets from double zero 


E.SINK.F.64.Z.D 


Ensemble convert floatinq-point 


octlets from double zero default 


E.SINK.F.128 


Ensemble convert floating-point 


hexlet from quad nearest default 


E.SINK.F.128.C


Ensemble convert floating-point 


hexlet from quad ceiling 


E.SINK.F.128.C.D 


Ensemble convert floatinq-point 


hexlet from quad ceiling default 


E.SINK.F.128.F


Ensemble convert floating-point


hexlet from quad floor


E.SINK.F.128.F.D 


Ensemble convert floating-point 


hexlet from quad floor default 


E.SINK.F.128.N 


Ensemble convert floatinq-point 


hexlet from quad nearest 


E.SINK.F.128.X 


Ensemble convert floatinq-point 


hexlet from quad exact 


E.SINK.F.128.Z 


Ensemble convert floatinq-point 


hexlet from quad zero


E.SINK.F.128.Z.D


Ensemble convert floating-point


hexlet from quad zero default 



FIG. 25A-1 



Selection 




Format 



E.SINK.F.prec.rnd rd=rc

rd=esinkfprecrnd(rc) 

31 24 23 



E.prec 



18 17 



rd 



12 U 



6 5 



rc | E.SINK.F.rnd | E.UNARY |



FIG. 25A-2 
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Definition 

def EnsembleSinkFloatingPoint(prec,round,rd,rc) as
    c <- RegRead(rc, 128)
    for i <- 0 to 128-prec by prec
        ci <- F(prec, c_{i+prec-1..i})
        a_{i+prec-1..i} <- fsinkr(prec, ci, round)
    endfor
    RegWrite(rd, 128, a)
enddef



FIG. 25B 



Exceptions 

Floating-point arithmetic 



FIG. 25C 



Definition 

def eb <- ebits(prec) as
    case prec of
        16:
            eb <- 5
        32:
            eb <- 8
        64:
            eb <- 11
        128:
            eb <- 15
    endcase
enddef

def eb <- ebias(prec) as
    eb <- 0 || 1^{ebits(prec)-1}
enddef

def fb <- fbits(prec) as
    fb <- prec - 1 - eb
enddef

def a <- F(prec, ai) as
    a.s <- ai_{prec-1}
    ae <- ai_{prec-2..fbits(prec)}
    af <- ai_{fbits(prec)-1..0}
    if ae = 1^{ebits(prec)} then
        if af = 0 then
            a.t <- INFINITY
        elseif af_{fbits(prec)-1} then
            a.t <- SNaN
            a.e <- -fbits(prec)
            a.f <- 1 || af_{fbits(prec)-1..0}
        else
            a.t <- QNaN
            a.e <- -fbits(prec)
            a.f <- af
        endif
    elseif ae = 0 then
        if af = 0 then
            a.t <- ZERO



2570 



FIG. 25D-1 



        else
            a.t <- NORM
            a.e <- 1-ebias(prec)-fbits(prec)
            a.f <- 0 || af
        endif
    else
        a.t <- NORM
        a.e <- ae-ebias(prec)-fbits(prec)
        a.f <- 1 || af
    endif
enddef



def a <- DEFAULTQNAN as
    a.s <- 0
    a.t <- QNAN
    a.e <- -1
    a.f <- 1
enddef



def a DEFAULTSNAN as 

a.s-*-0 

a.t-*-SNAN 

a.e 

a.f 
enddef 



FIG. 25D-2 



def fadd(a,b) as faddr(a,b,N) enddef

def c <- faddr(a,b,round) as
    if a.t=NORM and b.t=NORM then
        // d,e are a,b with exponent aligned and fraction adjusted
        if a.e > b.e then
            d <- a
            e.t <- b.t
            e.s <- b.s
            e.e <- a.e
            e.f <- b.f || 0^{a.e-b.e}
        else if a.e < b.e then
            d.t <- a.t
            d.s <- a.s
            d.e <- b.e
            d.f <- a.f || 0^{b.e-a.e}
            e <- b
        endif
        c.t <- d.t
        c.e <- d.e
        if d.s = e.s then
            c.s <- d.s
            c.f <- d.f + e.f
        elseif d.f > e.f then
            c.s <- d.s
            c.f <- d.f - e.f
        elseif d.f < e.f then
            c.s <- e.s
            c.f <- e.f - d.f
        else
            c.s <- r=F
            c.t <- ZERO
        endif



FIG. 25D-3 
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// priority is given to be operand for NaN propagation ' 
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c <- b
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c <- a
    elseif a.t=ZERO and b.t=ZERO then
        c.t <- ZERO
        c.s <- (a.s and b.s) or (round=F and (a.s or b.s))
    // NULL values are like zero, but do not combine with ZERO to alter sign
    elseif a.t=ZERO or a.t=NULL then
        c <- b
    elseif b.t=ZERO or b.t=NULL then
        c <- a
    elseif a.t=INFINITY and b.t=INFINITY then
        if a.s ≠ b.s then
            c <- DEFAULTSNAN // Invalid
        else
            c <- a
        endif
    elseif a.t=INFINITY then
        c <- a
    elseif b.t=INFINITY then
        c <- b
    else
        assert FALSE // should have covered all the cases above
    endif
enddef

def b <- fneg(a) as
    b.s <- ~a.s
    b.t <- a.t
    b.e <- a.e
    b.f <- a.f
enddef

def fsub(a,b) as fsubr(a,b,N) enddef

def fsubr(a,b,round) as faddr(a,fneg(b),round) enddef

def frsub(a,b) as frsubr(a,b,N) enddef

def frsubr(a,b,round) as faddr(fneg(a),b,round) enddef
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def c-*- fcom(a.b) as 

    if (a.t=SNAN) or (a.t=QNAN) or (b.t=SNAN) or (b.t=QNAN) then
        c <- U
    elseif a.t=INFINITY and b.t=INFINITY then
        if a.s ≠ b.s then
            c <- (a.s=0) ? G : L
        else
            c <- E
        endif
    elseif a.t=INFINITY then
        c <- (a.s=0) ? G : L
    elseif b.t=INFINITY then
        c <- (b.s=0) ? L : G
    elseif a.t=NORM and b.t=NORM then
        if a.s ≠ b.s then
            c <- (a.s=0) ? G : L
        else
            if a.e > b.e then
                af <- a.f
                bf <- b.f || 0^{a.e-b.e}
            else
                af <- a.f || 0^{b.e-a.e}
                bf <- b.f
            endif
            if af = bf then
                c <- E
            else
                c <- ((a.s=0) ^ (af > bf)) ? G : L
            endif
        endif
    elseif a.t=NORM then
        c <- (a.s=0) ? G : L
    elseif b.t=NORM then
        c <- (b.s=0) ? G : L
    elseif a.t=ZERO and b.t=ZERO then
        c <- E
    else
        assert FALSE // should have covered all the cases above
    endif
enddef
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def c -*-fmul(a,b) as f 
    if a.t=NORM and b.t=NORM then
        c.s <- a.s ^ b.s
        c.t <- NORM
        c.e <- a.e + b.e
        c.f <- a.f * b.f
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- b.t
        c.e <- b.e
        c.f <- b.f
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- a.t
        c.e <- a.e
        c.f <- a.f
    elseif a.t=ZERO and b.t=INFINITY then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=INFINITY and b.t=ZERO then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=ZERO or b.t=ZERO then
        c.s <- a.s ^ b.s
        c.t <- ZERO
    else
        assert FALSE // should have covered all the cases above
    endif
enddef
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def c fdivr(a,b) as r 
    if a.t=NORM and b.t=NORM then
        c.s <- a.s ^ b.s
        c.t <- NORM
        c.e <- a.e - b.e + 256
        c.f <- (a.f || 0^{256}) / b.f
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- b.t
        c.e <- b.e
        c.f <- b.f
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- a.t
        c.e <- a.e
        c.f <- a.f
    elseif a.t=ZERO and b.t=INFINITY then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=INFINITY and b.t=INFINITY then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=ZERO then
        c.s <- a.s ^ b.s
        c.t <- ZERO
    elseif a.t=INFINITY then
        c.s <- a.s ^ b.s
        c.t <- INFINITY
    else
        assert FALSE // should have covered all the cases above
    endif
enddef

def msb^- findmsb(a) as 

MAXF-+- 2 18 // Largest possible f value after matrix multiply 
forj-*-0 to MAXF 

ifa MAX F.i..j MO^^IlDthen 
msb-*- j 

endif 
endfor 
enddef 
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Def ai-«- PackF(prec,a,round) as 
case a.t of 
NORM: 

msb findmsb(a.f) 

m -*-msb-1-fbits(prec) //1sb for normal 

rdn^ -ebias(prec)-a.e-1-fbits(prec) // 1sb if a denormal 
rb (m > rdn) ? rn : rdn 
if rb < 0 then 

aifr-*- a.fmsb-i..o||0- rb 

eadj -*-0 

else 

case round of 

C: 

s ^_ 0 msb-rb|| , a s)rb 

F: 

s ^_o msb - rb || (a s) r b 
N, NONE: 

s ^.0msb-rb„. afrb||af rb-1 

A. 

if a.f r b.i..o * 0 then 

raise FloatingPointArithmetic // Inexact 



Z: 



endif 
s-*-0 



s-»-0 

endcase 

v^(0||a.f msb .. 0 ) + ( o||s) 
if v ms b=1 then 

aifr-^-v mS 5.-| ..rb 

eadj 0 

else 

aifr-*- 0 fbits (Pfec) 
eadj 1 
endif 
endif 

aien a.e + msb - 1 + eadj + ebias(prec) 
ifaiensOthen 

if round = NONE then 

ai^a.s||O ebi,s (P fec )||aifr 

else 

raise FloatingPointArithmetic //Underflow 

FIG. 25D-8 



endif 

elseif aien > lebits(prec) then 
if round = NONE then 

//default: round-to-nearest overflow handlinq 
ai a.s| | lebits(prec) 1 1 0 fbits(prec) a 

else 

raise FloatingPointArithmetic // Overflow 
endif 

else 

endif 3 '"*" a aienebi,s (P roc H -0 II ^ 

SNAN: 

if round * NONE then 

raise FloatingPointArithmetic //Invalid 
endif 

if -a.e < fbits(prec) then 

ai^-a.s||iebits(prec)|| a f ae1 Q || 0 fbits(prec)*a.e 

else 

Isb a.f-a. e -i-fbits(prec)+l..O *0 
endif"' 3 8 ' 1 1ebUS(PreC) l l a f -a.e-1..-a.e-1-fbits(prec).2 ||1sb 
QNAN: 

if -a.e < fbits(prec) then 

ai-»- a.s|| iebits(prec)|| a j. ae _, Q i | 0 fbits(prec)-a.e 

else 

1 S b a .f. a . e- 1-f bits(prec) +1 ..0*0 
endif ^ S " iebUS(PreC) ll a-f-a.e-1..-a.e-1-fbi,s(precK2l!lsb 
ZERO: 

ai-*- a.sll o ebits <P rec > 1 1 n^P' 60 ) 
INFINITY: 

ai a.s|| 1 eb '*s(prec)|j Qfbits(prec) 

endcase 
defdef 
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ai-*- fsinkr(prec, a, round) as 
case a.t of 
NORM: 

msb-*- findmsb(a.f) 
rb -*- -a.e 
if rb < 0 then 

aifr-*- a.fmsb..o||0- rb 
aims-*- msb - rb 

else 

case round of 
C.C.D: 

s -*-0 msb - rb ||(~ai.s) rb 

F.F.D: 

s-*-0 msb * rb ||(ai.s) rb 
N, NONE: 

s^-O msb - rb |hai.f rb ||a«.f r t 1 

X. 

if ai.frb-i..o * 0 then 

raise FloatingPointArithmetic // Inexact 
endif 

s-^-0 

Z, Z.D: 

s -*-0 

endcase 

v^(0||a.f msb .. 0 ) + (0||s) 
if v ms b=1 then 

aims -*- msb + 1 - rb 

else 

aims-*- msb - rb 
endif 

aifr-*- v a j m8 ..rb 
endif 

if aims > prec then 
case round of 

CD, F.D, NONE, Z.D: 

ai-*- a.s||(~as)prec-i 
C.F.N.X.Z: 

raise FloatingPointArithmetic // Overflow 

endcase 
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elseif a.s = 0 then r 
ai -*-aifr 

else 

ai-*--aifr 

endif 
ZERO: 

aj-»- 0P rec 
SNAN, QNAN: 
case round of 

CD, F.D, NONE, Z.D: 

ai^*. 0P rec 
C, F, N, X, Z: 

raise Floatingpoint Arithmetic // Invalid 

endcase 
INFINITY: 

case round of 

CD, F.D, NONE, Z.D: 

ai a.s || (~as)P rec - 1 
C, F, N, X, Z: 

raise FloatingPointArithmetic // Invalid 

endcase 

endcase 
enddef 



def c <- frecrest(a) as
    b.s <- 0
    b.t <- NORM
    b.e <- 0
    b.f <- 1
    c <- fest(fdiv(b,a))
enddef

def c <- frsqrest(a) as
    b.s <- 0
    b.t <- NORM
    b.e <- 0
    b.f <- 1
    c <- fest(fsqr(fdiv(b,a)))
enddef
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2570 



def c -«-fest(a) as 
if (a.t=NORM) then 

msb -*-findmsb(a.f) 
a.e-*-a.e +msb- 13 
a.f a.f mS b..msb-12|| 1 

else 

c a 
endif 
enddef 

def c <- fsqr(a) as
    if (a.t=NORM) and (a.s=0) then
        c.s <- 0
        c.t <- NORM
        if (a.e_{0} = 1) then
            c.e <- (a.e-127) / 2
            c.f <- sqr(a.f || 0^{127})
        else
            c.e <- (a.e-128) / 2
            c.f <- sqr(a.f || 0^{128})
        endif
    elseif (a.t=SNAN) or (a.t=QNAN) or a.t=ZERO or ((a.t=INFINITY) and (a.s=0)) then
        c <- a
    elseif ((a.t=NORM) or (a.t=INFINITY)) and (a.s=1) then
        c <- DEFAULTSNAN // Invalid
    else
        assert FALSE // should have covered all the cases above
    endif
enddef
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Operation codes 



G.ADD.8


Group add bytes


G.ADD.16


Group add doublets


G.ADD.32


Group add quadlets


G.ADD.64


Group add octlets


G.ADD.128


Group add hexlet


G.ADD.L.8


Group add limit signed bytes


G.ADD.L.16


Group add limit signed doublets


G.ADD.L.32


Group add limit signed quadlets


G.ADD.L.64


Group add limit signed octlets


G.ADD.L.128


Group add limit signed hexlet


G.ADD.L.U.8


Group add limit unsigned bytes


G.ADD.L.U.16


Group add limit unsigned doublets


G.ADD.L.U.32


Group add limit unsigned quadlets


G.ADD.L.U.64


Group add limit unsigned octlets


G.ADD.L.U.128


Group add limit unsigned hexlet


G.ADD.8.0 


Group add signed bvtes check overflow 


G.ADD.16.0 


Group add signed doublets check overflow 


G.ADD.32.0 


Group add signed quadlets check overflow 


G.ADD.64.0 


Group add signed octlets check overflow 


G.ADD.128.0 


Group add signed hexlet check overflow 


G.ADD.U.8.0 


Group add unsigned bytes check overflow 


G.ADD.U.16.0 


Group add unsigned doublets check overflow 


G.ADD.U.32.0 


Group add unsigned quadlets check overflow 


G.ADD.U.64.0 


Group add unsigned octlets check overflow 


G.ADD.U.128.0 


Group add unsigned hexlet check overflow 
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Format 



G. op. size rd=rc,rb 
rd=gopsize(rc,rb) 

31 24 23 18 17 12 11 65 0 

f G.size 1 rd | rc 1 rb 1 op 1 

8 6 6 6 6 
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Definition 

def Group(op,size,rd,rc,rb)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    case op of
        G.ADD:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- c_{i+size-1..i} + b_{i+size-1..i}
            endfor
        G.ADD.L:
            for i <- 0 to 128-size by size
                t <- (c_{i+size-1} || c_{i+size-1..i}) + (b_{i+size-1} || b_{i+size-1..i})
                a_{i+size-1..i} <- (t_{size} ≠ t_{size-1}) ? (t_{size} || t_{size-1}^{size-1}) : t_{size-1..0}
            endfor
        G.ADD.L.U:
            for i <- 0 to 128-size by size
                t <- (0^{1} || c_{i+size-1..i}) + (0^{1} || b_{i+size-1..i})
                a_{i+size-1..i} <- (t_{size} ≠ 0) ? (1^{size}) : t_{size-1..0}
            endfor
        G.ADD.O:
            for i <- 0 to 128-size by size
                t <- (c_{i+size-1} || c_{i+size-1..i}) + (b_{i+size-1} || b_{i+size-1..i})
                if t_{size} ≠ t_{size-1} then
                    raise FixedPointArithmetic
                endif
                a_{i+size-1..i} <- t_{size-1..0}
            endfor
        G.ADD.U.O:
            for i <- 0 to 128-size by size
                t <- (0^{1} || c_{i+size-1..i}) + (0^{1} || b_{i+size-1..i})
                if t_{size} ≠ 0 then
                    raise FixedPointArithmetic
                endif
                a_{i+size-1..i} <- t_{size-1..0}
            endfor
    endcase
    RegWrite(rd, 128, a)
enddef
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Operation codes 



G.SET.AND.E.8


Group set and equal zero bytes


G.SET.AND.E.16


Group set and equal zero doublets


G.SET.AND.E.32


Group set and equal zero quadlets


G.SET.AND.E.64


Group set and equal zero octlets


G.SET.AND.E.128


Group set and equal zero hexlet


G.SET.AND.NE.8


Group set and not equal zero bytes


G.SET.AND.NE.16


Group set and not equal zero doublets


G.SET.AND.NE.32


Group set and not equal zero quadlets


G.SET.AND.NE.64


Group set and not equal zero octlets


G.SET.AND.NE.128


Group set and not equal zero hexlet


G.SET.E.8


Group set equal bytes


G.SET.E.16


Group set equal doublets


G.SET.E.32


Group set equal quadlets


G.SET.E.64


Group set equal octlets


G.SET.E.128


Group set equal hexlet


G.SET.GE.8


Group set greater equal signed bytes


G.SET.GE.16


Group set greater equal signed doublets


G.SET.GE.32


Group set greater equal signed quadlets


G.SET.GE.64


Group set greater equal signed octlets


G.SET.GE.128


Group set greater equal signed hexlet


G.SET.GE.U.8


Group set greater equal unsigned bytes


G.SET.GE.U.16


Group set greater equal unsigned doublets


G.SET.GE.U.32


Group set greater equal unsigned quadlets


G.SET.GE.U.64


Group set greater equal unsigned octlets


G.SET.GE.U.128


Group set greater equal unsigned hexlet


G.SET.L.8


Group set signed less bytes


G.SET.L.16


Group set signed less doublets


G.SET.L.32


Group set signed less quadlets


G.SET.L.64


Group set signed less octlets


G.SET.L.128


Group set signed less hexlet


G.SET.L.U.8


Group set less unsigned bytes


G.SET.L.U.16


Group set less unsigned doublets


G.SET.L.U.32


Group set less unsigned quadlets


G.SET.L.U.64


Group set less unsigned octlets


G.SET.L.U.128 


Group set less unsigned hexlet 


G.SET.NE.8 


Group set not equal bytes 


G.SET.NE.16 


Group set not equal doublets 


G.SET.NE.32 


Group set not equal quadlets 


G.SET.NE.64 


Group set not equal octlets 


G.SET.NE.128 


Group set not equal hexlet 


G.SUB.8 


Group subtract bytes 


G.SUB.8.0 


Group subtract signed bytes check overflow 
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G.SUB.16 


Group subtract doublets 


G.SUB.16.0


Group subtract signed doublets check overflow


G.SUB.32


Group subtract quadlets


G.SUB.32.0


Group subtract signed quadlets check overflow


G.SUB.64


Group subtract octlets


G.SUB.64.0


Group subtract signed octlets check overflow


G.SUB.128


Group subtract hexlet


G.SUB.128.0


Group subtract signed hexlet check overflow


G.SUB.L.8 


Group subtract limit signed bytes 


G.SUB.L.16 


Group subtract limit signed doublets 


G.SUB.L.32 


Group subtract limit signed quadlets 


G.SUB.L.64 


Group subtract limit signed octlets 


G.SUB.L.128 


Group subtract limit signed hexlet 


G.SUB.L.U.8 


Group subtract limit unsigned bytes 


GSUB.L.U.16 


Group subtract limit unsigned doublets 


GSUB.L.U.32 


Group subtract limit unsigned quadlets 


G.SUB.L.U.64 


Group subtract limit unsigned octlets 


GSUB.L.U.128 


Group subtract limit unsigned hexlet 


G.SUB.U.8.0 


Group subtract unsigned bytes check overflow \ 


G.SUB.U.16.0 


Group subtract unsigned doublets check overflow 


G.SUB.U.32.0 


Group subtract unsigned quadlets check overflow 


G.SUB.U.64.0 


Group subtract unsigned octlets check overflow 


G.SUB.U.128.0 


Group subtract unsigned hexlet check overflow 
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Format 



G.op.size rd=rb,rc 
rd=gopsize(rb,rc) 

31 24 23 18 17 12 11 65 0 

G.size | rd 1 rc | rb | op | 

8 6 6 6 6 
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Definition 

def GroupReversed(op,size,rd,rc,rb)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    case op of
        G.SUB:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- b_{i+size-1..i} - c_{i+size-1..i}
            endfor
        G.SUB.L:
            for i <- 0 to 128-size by size
                t <- (b_{i+size-1} || b_{i+size-1..i}) - (c_{i+size-1} || c_{i+size-1..i})
                a_{i+size-1..i} <- (t_{size} ≠ t_{size-1}) ? (t_{size} || t_{size-1}^{size-1}) : t_{size-1..0}
            endfor
        G.SUB.L.U:
            for i <- 0 to 128-size by size
                t <- (0^{1} || b_{i+size-1..i}) - (0^{1} || c_{i+size-1..i})
                a_{i+size-1..i} <- (t_{size} ≠ 0) ? 0^{size} : t_{size-1..0}
            endfor
        G.SUB.O:
            for i <- 0 to 128-size by size
                t <- (b_{i+size-1} || b_{i+size-1..i}) - (c_{i+size-1} || c_{i+size-1..i})
                if (t_{size} ≠ t_{size-1}) then
                    raise FixedPointArithmetic
                endif
                a_{i+size-1..i} <- t_{size-1..0}
            endfor
        G.SUB.U.O:
            for i <- 0 to 128-size by size
                t <- (0^{1} || b_{i+size-1..i}) - (0^{1} || c_{i+size-1..i})
                if (t_{size} ≠ 0) then
                    raise FixedPointArithmetic
                endif
                a_{i+size-1..i} <- t_{size-1..0}
            endfor
        G.SET.E:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- (b_{i+size-1..i} = c_{i+size-1..i})^{size}
            endfor
        G.SET.NE:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- (b_{i+size-1..i} ≠ c_{i+size-1..i})^{size}
            endfor
        G.SET.AND.E:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- ((b_{i+size-1..i} and c_{i+size-1..i}) = 0)^{size}
            endfor
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G.SET.AND.NE: 

for i 0 to 128-size by size 

ai+size-l..i <- ((bi+ s ize-l..i and ci+ s i ze ~i . i) * 0) slze 
endfor 
G.SET.L: 

for i <- 0 to 128-size by size 

ai+size-l..i <- ((rc = rb) ? (bi+ s ize-l..i < 0) : (bi+size-l..i < ci+ s ize-l..i)) size 
endfor 
G.SET.GE: 

for i +- 0 to 128-size by size 

ai+size-l..i <- ((rc = rb) ? (bj+size-L.i > 0) : (bi+size-l..i ^ ci +s ize.l..i)) size 
endfor 
G.SET.L.U: 

for i <- 0 to 128-size by size 

ai+size-l..i <- ((rc = rb) ? (bi+ s ize-l..i > 0) : 
((0 || bi +s i ze -i..i) < (0 || Ci +S i2e.l..i))) size 

endfor 
G.SET.GE.U: 

for i <- 0 to 128-size by size 

ai+size-l..i <~ ((rc = rb) ? (bi+ s ize-L.i * 0) : 
((0 || bi+size-1 .0 ^ (0 || c i+s ize-L.i))) size 

endfor 

endcase 

RegWrite(rd, 128, a) 
enddef 
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Operation codes 



E.C0N.8 


Ensemble convolve signed bytes 


E.CON.16 


Ensemble convolve signed doublets 


E.CON.32 


Ensemble convolve signed quadlets 


E.C0N.64 


Ensemble convolve signed octlets 


E.C0N.C.8 


Ensemble convolve complex bytes 


E.C0N.C.16 


Ensemble convolve complex doublets 


E.CON.C.32 


Ensemble convolve complex quadlets 


E.C0N.M.8 


Ensemble convolve mixed-signed bytes 


E.C0N.M.16 


Ensemble convolve mixed-siened doublets 


E.CON.M.32 


Ensemble convolve mixed-signed quadlets 


E.CON.M.64 


Ensemble convolve mixed-signed octlets 1 


E.C0N.U.8 


Ensemble convolve unsigned bytes 


E.C0N.U.16 


Ensemble convolve unsigned doublets 


E.CON.U.32 


Ensemble convolve unsigned quadlets 


E.CON.U.64 


Ensemble convolve unsigned octlets 


E.DIV.64 


Ensemble divide signed octlets 


E.DIV.U.64 


Ensemble divide unsigned octlets 


E.MUL.8 


Ensemble multiply signed bytes 


E.MUL.16 


Ensemble multiply signed doublets 


E.MUL.32 


Ensemble multiply signed quadlets 


E.MUL.64 


Ensemble multiply signed octlets 


E.MUL.SUM.8 


Ensemble multiply sum signed bytes 


E.MUL.SUM.16 


Ensemble multiply sum signed doublets 


E.MUL.SUM.32 


Ensemble multiply sum signed quadlets 


E.MUL.SUM.64 


Ensemble multiply sum signed octlets 


E.MUL.C.8 


Ensemble complex multiply bytes 


E.MUL.C.16 


Ensemble complex multiply doublets 


E.MUL.C.32 


Ensemble complex multiply quadlets 


E.MUL.M.8 


Ensemble multiply mixed-signed bytes ~] 


E.MUL.M.16 


Ensemble multiply mixed-signed doublets 


E.MUL.M.32 


Ensemble multiply mixed-signed quadlets 


E.MUL.M.64 


Ensemble multiply mixed-signed octlets 


E.MUL.P.8 


Ensemble multiply polynomial bytes 


E.MUL.P.16 


Ensemble multiply polynomial doublets 


E.MUL.P.32 


Ensemble multiply polynomial quadlets 


E.MUL.P.64 


Ensemble multiply polynomial octlets 


E.MUL.SUM.C.8 


Ensemble multiply sum complex bytes 


E.MUL.SUM.C.16 


Ensemble multiply sum complex doublets 


E.MUL.SUM.C.32 


Ensemble multiply sum complex quadlets 


E.MUL.SUM.M.8 


Ensemble multiply sum mixed-signed bytes 


E.MUL.SUM.M. 16 


Ensemble multiply sum mixed-signed doublets 


E.MUL.SUM.M.32 


Ensemble multiply sum mixed-signed quadlets 


E.MUL.SUM.M.64 


Ensemble multiply sum mixed-signed octlets 
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E.MUL.SUM.U.8 


Ensemble multiply sum unsigned bytes 


E.MUL.SUM.U.16 


Ensemble multiply sum unsigned doublets 


E.MUL.SUM.U.32 


Ensemble multiply sum unsigned quadlets 


E.MUL.SUM.U.64 


Ensemble multiply sum unsigned octlets 


E.MUL.U.8 


Ensemble multiply unsigned bytes 


E.MUL.U.16 


Ensemble multiply unsigned doublets 


E.MUL.U.32 


Ensemble multiply unsigned quadlets 


E.MUL.U.64 


Ensemble multiply unsigned octlets 
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Format 



E.op.size rd=rc,rb 
rd=eopsize(rc,rb) 

31 24 23 18 17 12 11 65 0 

| E.size | rd | rc | rb | op 

8 6 6 6 6 
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Definition 



def mul(size,h,vs,v,i,ws,w,j) as
    mul <- ((vs & v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws & w_{size-1+j})^{h-size} || w_{size-1+j..j})
enddef

def c <- PolyMultiply(size,a,b) as
    p[0] <- 0^{2*size}
    for k <- 0 to size-1
        p[k+1] <- p[k] ^ a_{k} ? (0^{size-k} || b || 0^{k}) : 0^{2*size}
    endfor
    c <- p[size]
enddef

def Ensemble(op,size,rd,rc,rb)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    case op of
        E.MUL, E.MUL.C, E.MUL.SUM, E.MUL.SUM.C, E.CON, E.CON.C, E.DIV:
            cs <- bs <- 1
        E.MUL.M, E.MUL.SUM.M, E.CON.M:
            cs <- 0
            bs <- 1
        E.MUL.U, E.MUL.SUM.U, E.CON.U, E.DIV.U, E.MUL.P:
            cs <- bs <- 0
    endcase
    case op of
        E.MUL, E.MUL.U, E.MUL.M:
            for i <- 0 to 64-size by size
                d_{2*(i+size)-1..2*i} <- mul(size,2*size,cs,c,i,bs,b,i)
            endfor
        E.MUL.P:
            for i <- 0 to 64-size by size
                d_{2*(i+size)-1..2*i} <- PolyMultiply(size,c_{size-1+i..i},b_{size-1+i..i})
            endfor
        E.MUL.C:
            for i <- 0 to 64-size by size
                if (i and size) = 0 then
                    p <- mul(size,2*size,1,c,i,1,b,i) - mul(size,2*size,1,c,i+size,1,b,i+size)
                else
                    p <- mul(size,2*size,1,c,i,1,b,i+size) + mul(size,2*size,1,c,i,1,b,i+size)
                endif
                d_{2*(i+size)-1..2*i} <- p
            endfor
        E.MUL.SUM, E.MUL.SUM.U, E.MUL.SUM.M:
            p[0] <- 0^{128}
            for i <- 0 to 128-size by size
                p[i+size] <- p[i] + mul(size,128,cs,c,i,bs,b,i)
            endfor
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a*-p[128] 

        E.MUL.SUM.C:
            p[0] <- 0^{64}
            p[size] <- 0^{64}
            for i <- 0 to 128-size by size
                if (i and size) = 0 then
                    p[i+2*size] <- p[i] + mul(size,64,1,c,i,1,b,i)
                                        - mul(size,64,1,c,i+size,1,b,i+size)
                else
                    p[i+2*size] <- p[i] + mul(size,64,1,c,i,1,b,i+size)
                                        + mul(size,64,1,c,i+size,1,b,i)
                endif
            endfor
            a <- p[128+size] || p[128]
        E.CON, E.CON.U, E.CON.M:
            p[0] <- 0^{128}
            for j <- 0 to 64-size by size
                for i <- 0 to 64-size by size
                    p[j+size]_{2*(i+size)-1..2*i} <- p[j]_{2*(i+size)-1..2*i} +
                        mul(size,2*size,cs,c,i+64-j,bs,b,j)
                endfor
            endfor
            a <- p[64]
        E.CON.C:
            p[0] <- 0^{128}
            for j <- 0 to 64-size by size
                for i <- 0 to 64-size by size
                    if ((~i) and j and size) = 0 then
                        p[j+size]_{2*(i+size)-1..2*i} <- p[j]_{2*(i+size)-1..2*i} +
                            mul(size,2*size,1,c,i+64-j,1,b,j)
                    else
                        p[j+size]_{2*(i+size)-1..2*i} <- p[j]_{2*(i+size)-1..2*i} -
                            mul(size,2*size,1,c,i+64-j+2*size,1,b,j)
                    endif
                endfor
            endfor
            a <- p[64]
        E.DIV:
            if (b = 0) or ( (c = (1 || 0^{63})) and (b = 1^{64}) ) then
                a <- undefined
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q <- c / b 
                r <- c - q*b
                a <- r_{63..0} || q_{63..0}
            endif
        E.DIV.U:
            if b = 0 then
                a <- undefined
            else
                q <- (0 || c) / (0 || b)
                r <- c - (0 || q)*(0 || b)
                a <- r_{63..0} || q_{63..0}
            endif
    endcase
    RegWrite(rd, 128, a)
enddef
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Floating-point function Definitions 

def eb <- ebits(prec) as
    case prec of
        16:
            eb <- 5
        32:
            eb <- 8
        64:
            eb <- 11
        128:
            eb <- 15
    endcase
enddef

def eb <- ebias(prec) as
    eb <- 0 || 1^{ebits(prec)-1}
enddef

def fb <- fbits(prec) as
    fb <- prec - 1 - eb
enddef

def a <- F(prec, ai) as
    a.s <- ai_{prec-1}
    ae <- ai_{prec-2..fbits(prec)}
    af <- ai_{fbits(prec)-1..0}
    if ae = 1^{ebits(prec)} then
        if af = 0 then
            a.t <- INFINITY
        elseif af_{fbits(prec)-1} then
            a.t <- SNaN
            a.e <- -fbits(prec)
            a.f <- 1 || af_{fbits(prec)-2..0}
        else
            a.t <- QNaN
            a.e <- -fbits(prec)
            a.f <- af
        endif
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elseif ae = 0 then 
        if af = 0 then
            a.t <- ZERO
        else
            a.t <- NORM
            a.e <- 1-ebias(prec)-fbits(prec)
            a.f <- 0 || af
        endif
    else
        a.t <- NORM
        a.e <- ae-ebias(prec)-fbits(prec)
        a.f <- 1 || af
    endif
enddef

def a <- DEFAULTQNAN as
    a.s <- 0
    a.t <- QNAN
    a.e <- -1
    a.f <- 1
enddef

def a <- DEFAULTSNAN as
    a.s <- 0
    a.t <- SNAN
    a.e <- -1
    a.f <- 1
enddef

def fadd(a,b) as faddr(a,b,N) enddef 

def c <- faddr(a,b,round) as
    if a.t=NORM and b.t=NORM then
        // d,e are a,b with exponent aligned and fraction adjusted
        if a.e > b.e then
            d <- a
            e.t <- b.t
            e.s <- b.s
            e.e <- a.e
            e.f <- b.f || 0^{a.e-b.e}
        else if a.e < b.e then
            d.t <- a.t
            d.s <- a.s
            d.e <- b.e
            d.f <- a.f || 0^{b.e-a.e}
            e <- b
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endif 
        c.t <- d.t
        c.e <- d.e
        if d.s = e.s then
            c.s <- d.s
            c.f <- d.f + e.f
        elseif d.f > e.f then
            c.s <- d.s
            c.f <- d.f - e.f
        elseif d.f < e.f then
            c.s <- e.s
            c.f <- e.f - d.f
        else
            c.s <- r=F
            c.t <- ZERO
        endif
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c <- b
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c <- a
    elseif a.t=ZERO and b.t=ZERO then
        c.t <- ZERO
        c.s <- (a.s and b.s) or (round=F and (a.s or b.s))
    // NULL values are like zero, but do not combine with ZERO to alter sign
    elseif a.t=ZERO or a.t=NULL then
        c <- b
    elseif b.t=ZERO or b.t=NULL then
        c <- a
    elseif a.t=INFINITY and b.t=INFINITY then
        if a.s ≠ b.s then
            c <- DEFAULTSNAN // Invalid
        else
            c <- a
        endif
    elseif a.t=INFINITY then
        c <- a
    elseif b.t=INFINITY then
        c <- b
    else
        assert FALSE // should have covered all the cases above
    endif
enddef

def b <- fneg(a) as
    b.s <- ~a.s
    b.t <- a.t
    b.e <- a.e
    b.f <- a.f
enddef
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def fsubr(a,b,round) as faddr(a,fheg(b),round) enddef 



def frsub(a,b) as frsubr(a,b,N) enddef 

def frsubr(a,b,round) as faddr(fneg(a),b,round) enddef

def c <- fcom(a,b) as
    if (a.t=SNAN) or (a.t=QNAN) or (b.t=SNAN) or (b.t=QNAN) then
        c <- U
    elseif a.t=INFINITY and b.t=INFINITY then
        if a.s ≠ b.s then
            c <- (a.s=0) ? G: L
        else
            c <- E
        endif
    elseif a.t=INFINITY then
        c <- (a.s=0) ? G: L
    elseif b.t=INFINITY then
        c <- (b.s=0) ? G: L
    elseif a.t=NORM and b.t=NORM then
        if a.s ≠ b.s then
            c <- (a.s=0) ? G: L
        else
            if a.e > b.e then
                af <- a.f
                bf <- b.f || 0^{a.e-b.e}
            else
                af <- a.f || 0^{b.e-a.e}
                bf <- b.f
            endif
            if af = bf then
                c <- E
            else
                c <- ((a.s=0) ^ (af > bf)) ? G : L
            endif
        endif
    elseif a.t=NORM then
        c <- (a.s=0) ? G: L
    elseif b.t=NORM then
        c <- (b.s=0) ? G: L
    elseif a.t=ZERO and b.t=ZERO then
        c <- E
    else
        assert FALSE // should have covered all the cases above
    endif
enddef
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def c <r~ frnul(a,b) as 

    if a.t=NORM and b.t=NORM then
        c.s <- a.s ^ b.s
        c.t <- NORM
        c.e <- a.e + b.e
        c.f <- a.f * b.f
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- b.t
        c.e <- b.e
        c.f <- b.f
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- a.t
        c.e <- a.e
        c.f <- a.f
    elseif a.t=ZERO and b.t=INFINITY then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=INFINITY and b.t=ZERO then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=ZERO or b.t=ZERO then
        c.s <- a.s ^ b.s
        c.t <- ZERO
    else
        assert FALSE // should have covered all the cases above
    endif
enddef

def c <- fdivr(a,b) as
    if a.t=NORM and b.t=NORM then
        c.s <- a.s ^ b.s
        c.t <- NORM
        c.e <- a.e - b.e + 256
        c.f <- (a.f || 0^{256}) / b.f
    // priority is given to b operand for NaN propagation
    elseif (b.t=SNAN) or (b.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- b.t
        c.e <- b.e
        c.f <- b.f
    elseif (a.t=SNAN) or (a.t=QNAN) then
        c.s <- a.s ^ b.s
        c.t <- a.t
        c.e <- a.e
        c.f <- a.f
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elseif a.t=ZERO and b.t=ZERO then 

        c <- DEFAULTSNAN // Invalid
    elseif a.t=INFINITY and b.t=INFINITY then
        c <- DEFAULTSNAN // Invalid
    elseif a.t=ZERO then
        c.s <- a.s ^ b.s
        c.t <- ZERO
    elseif a.t=INFINITY then
        c.s <- a.s ^ b.s
        c.t <- INFINITY
    else
        assert FALSE // should have covered all the cases above
    endif
enddef

def msb <- findmsb(a) as
    MAXF <- 2^{18} // Largest possible f value after matrix multiply
    for j <- 0 to MAXF
        if a_{MAXF-1..j} = (0^{MAXF-1-j} || 1) then
            msb <- j
        endif
    endfor
enddef

def ai <- PackF(prec,a,round) as
    case a.t of
        NORM:
            msb <- findmsb(a.f)
            rn <- msb-1-fbits(prec) // lsb for normal
            rdn <- -ebias(prec)-a.e-1-fbits(prec) // lsb if a denormal
            rb <- (rn > rdn) ? rn : rdn
            if rb < 0 then
                aifr <- a.f_{msb-1..0} || 0^{-rb}
                eadj <- 0
            else
                case round of
                    C:
                        s <- 0^{msb-rb} || (~a.s)^{rb}
                    F:
                        s <- 0^{msb-rb} || (a.s)^{rb}
                    N, NONE:
                        s <- 0^{msb-rb} || ~a.f_{rb} || a.f_{rb}^{rb-1}
                    X:
                        if a.f_{rb-1..0} ≠ 0 then
                            raise FloatingPointArithmetic // Inexact
                        endif
                        s <- 0
                    Z:
                        s <- 0
                endcase
                v <- (0 || a.f_{msb..0}) + (0 || s)
                if v_{msb} = 1 then
                    aifr <- v_{msb-1..rb}
                    eadj <- 0
                else
                    aifr <- 0^{fbits(prec)}
                    eadj <- 1
                endif
            endif
            aien <- a.e + msb - 1 + eadj + ebias(prec)
            if aien < 0 then
                if round = NONE then
                    ai <- a.s || 0^{ebits(prec)} || aifr
                else
                    raise FloatingPointArithmetic //Underflow
                endif
            elseif aien > 1^{ebits(prec)} then
                if round = NONE then
                    //default: round-to-nearest overflow handling
                    ai <- a.s || 1^{ebits(prec)} || 0^{fbits(prec)}
                else
                    raise FloatingPointArithmetic //Underflow
                endif
            else
                ai <- a.s || aien_{ebits(prec)-1..0} || aifr
            endif
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SNAN: 

            if round ≠ NONE then
                raise FloatingPointArithmetic //Invalid
            endif
            if -a.e < fbits(prec) then
                ai <- a.s || 1^{ebits(prec)} || a.f_{-a.e-1..0} || 0^{fbits(prec)+a.e}
            else
                lsb <- a.f_{-a.e-1-fbits(prec)+1..0} ≠ 0
                ai <- a.s || 1^{ebits(prec)} || a.f_{-a.e-1..-a.e-1-fbits(prec)+2} || lsb
            endif
        QNAN:
            if -a.e < fbits(prec) then
                ai <- a.s || 1^{ebits(prec)} || a.f_{-a.e-1..0} || 0^{fbits(prec)+a.e}
            else
                lsb <- a.f_{-a.e-1-fbits(prec)+1..0} ≠ 0
                ai <- a.s || 1^{ebits(prec)} || a.f_{-a.e-1..-a.e-1-fbits(prec)+2} || lsb
            endif
        ZERO:
            ai <- a.s || 0^{ebits(prec)} || 0^{fbits(prec)}
        INFINITY:
            ai <- a.s || 1^{ebits(prec)} || 0^{fbits(prec)}
    endcase
enddef

def ai <- fsinkr(prec, a, round) as
    case a.t of
        NORM:
            msb <- findmsb(a.f)
            rb <- -a.e
            if rb < 0 then
                aifr <- a.f_{msb..0} || 0^{-rb}
                aims <- msb - rb
            else
                case round of
                    C, C.D:
                        s <- 0^{msb-rb} || (~ai.s)^{rb}
                    F, F.D:
                        s <- 0^{msb-rb} || (ai.s)^{rb}
                    N, NONE:
                        s <- 0^{msb-rb} || ~ai.f_{rb} || ai.f_{rb}^{rb-1}
                    X:
                        if ai.f_{rb-1..0} ≠ 0 then
                            raise FloatingPointArithmetic // Inexact
                        endif
                        s <- 0
                    Z, Z.D:
                        s <- 0
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. endcase 
                v <- (0 || a.f_{msb..0}) + (0 || s)
                if v_{msb} = 1 then
                    aims <- msb + 1 - rb
                else
                    aims <- msb - rb
                endif
                aifr <- v_{aims..rb}
            endif
            if aims > prec then
                case round of
                    C.D, F.D, NONE, Z.D:
                        ai <- a.s || (~a.s)^{prec-1}
                    C, F, N, X, Z:
                        raise FloatingPointArithmetic // Overflow
                endcase
            elseif a.s = 0 then
                ai <- aifr
            else
                ai <- -aifr
            endif
        ZERO:
            ai <- 0^{prec}
        SNAN, QNAN:
            case round of
                C.D, F.D, NONE, Z.D:
                    ai <- 0^{prec}
                C, F, N, X, Z:
                    raise FloatingPointArithmetic // Invalid
            endcase
        INFINITY:
            case round of
                C.D, F.D, NONE, Z.D:
                    ai <- a.s || (~a.s)^{prec-1}
                C, F, N, X, Z:
                    raise FloatingPointArithmetic // Invalid
            endcase
    endcase
enddef



def c <- frecrest(a) as
    b.s <- 0
    b.t <- NORM
    b.e <- 0
    b.f <- 1
    c <- fest(fdiv(b,a))
enddef
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def c <— frsqrest(a) as 
    b.s <- 0
    b.t <- NORM
    b.e <- 0
    b.f <- 1
    c <- fest(fsqr(fdiv(b,a)))
enddef



def c <- fest(a) as
    if (a.t=NORM) then
        msb <- findmsb(a.f)
        a.e <- a.e + msb - 13
        a.f <- a.f_{msb..msb-12} || 1
    else
        c <- a
    endif
enddef



def c <- fsqr(a) as
    if (a.t=NORM) and (a.s=0) then
        c.s <- 0
        c.t <- NORM
        if (a.e_{0} = 1) then
            c.e <- (a.e-127) / 2
            c.f <- sqr(a.f || 0^{127})
        else
            c.e <- (a.e-128) / 2
            c.f <- sqr(a.f || 0^{128})
        endif
    elseif (a.t=SNAN) or (a.t=QNAN) or a.t=ZERO or ((a.t=INFINITY) and (a.s=0)) then
        c <- a
    elseif ((a.t=NORM) or (a.t=INFINITY)) and (a.s=1) then
        c <- DEFAULTSNAN // Invalid
    else
        assert FALSE // should have covered all the cases above
    endif
enddef
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Operation codes 



E.ADD.F.16


Ensemble add floating-point half


E.ADD.F.16.C


Ensemble add floating-point half ceiling


E.ADD.F.16.F


Ensemble add floating-point half floor


E.ADD.F.16.N


Ensemble add floating-point half nearest


E.ADD.F.16.X


Ensemble add floating-point half exact


E.ADD.F.16.Z


Ensemble add floating-point half zero


E.ADD.F.32


Ensemble add floating-point single


E.ADD.F.32.C


Ensemble add floating-point single ceiling


E.ADD.F.32.F


Ensemble add floating-point single floor


E.ADD.F.32.N


Ensemble add floating-point single nearest


E.ADD.F.32.X


Ensemble add floating-point single exact


E.ADD.F.32.Z


Ensemble add floating-point single zero


E.ADD.F.64


Ensemble add floating-point double


E.ADD.F.64.C


Ensemble add floating-point double ceiling


E.ADD.F.64.F


Ensemble add floating-point double floor


E.ADD.F.64.N


Ensemble add floating-point double nearest


E.ADD.F.64.X


Ensemble add floating-point double exact


E.ADD.F.64.Z


Ensemble add floating-point double zero


E.ADD.F.128


Ensemble add floating-point quad


E.ADD.F.128.C


Ensemble add floating-point quad ceiling


E.ADD.F.128.F


Ensemble add floating-point quad floor


E.ADD.F.128.N


Ensemble add floating-point quad nearest


E.ADD.F.128.X


Ensemble add floating-point quad exact


E.ADD.F.128.Z


Ensemble add floating-point quad zero


E.DIV.F.16 


Ensemble divide floating-point half 


E.DIV.F.16.C


Ensemble divide floating-point half ceiling


E.DIV.F.16.F


Ensemble divide floating-point half floor


E.DIV.F.16.N


Ensemble divide floating-point half nearest


E.DIV.F.16.X


Ensemble divide floating-point half exact


E.DIV.F.16.Z


Ensemble divide floating-point half zero


E.DIV.F.32 


Ensemble divide floating-point single 


E.DIV.F.32.C 


Ensemble divide floating-point single ceiling 


E.DIV.F.32.F 


Ensemble divide floating-point single floor 


E.DIV.F.32.N 


Ensemble divide floating-point single nearest 


E.DIV.F.32.X 


Ensemble divide floating-point single exact 


E.DIV.F.32.Z


Ensemble divide floating-point single zero 


E.DIV.F.64 


Ensemble divide floating-point double 
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E.DIV.F.64.C 


Ensemble divide floating-point double ceiling


E.DIV.F.64.F 


Ensemble divide floating-point double floor 


E.DIV.F.64.N


Ensemble divide floating-point double nearest


E.DIV.F.64.X 


Ensemble divide floating-point double exact 


E.DIV.F.64.Z 


Ensemble divide floating-point double zero 


E.DIV.F.128 


Ensemble divide floating-point quad 


E.DIV.F.128.C 


Ensemble divide floating-point quad ceiling 


E.DIV.F.128.F 


Ensemble divide floating-point quad floor 


E.DIV.F.128.N 


Ensemble divide floating-point quad nearest 


E.DIV.F.128.X


Ensemble divide floating-point quad exact 


E.DIV.F.128.Z 


Ensemble divide floating-point quad zero 


E.MUL.C.F.16 


Ensemble multiply complex floating-point half 


E.MUL.C.F.32 


Ensemble multiply complex floating-point single 


E.MUL.C.F.64 


Ensemble multiply complex floating-point double 


E.MUL.F.16 


Ensemble multiply floating-point half


E.MUL.F.16.C 


Ensemble multiply floating-point half ceiling 


E.MUL.F.16.F 


Ensemble multiply floating-point half floor 


E.MUL.F.16.N 


Ensemble multiply floating-point half nearest


E.MUL.F.16.X 


Ensemble multiply floating-point half exact 


E.MUL.F.16.Z 


Ensemble multiply floating-point half zero 


E.MUL.F.32 


Ensemble multiply floating-point single 


E.MUL.F.32.C 


Ensemble multiply floating-point single ceiling 


E.MUL.F.32.F 


Ensemble multiply floating-point single floor 


E.MUL.F.32.N 


Ensemble multiply floating-point single nearest 


E.MUL.F.32.X 


Ensemble multiply floating-point single exact 


E.MUL.F.32.Z 


Ensemble multiply floating-point single zero 


E.MUL.F.64 


Ensemble multiply floating-point double 


E.MUL.F.64.C 


Ensemble multiply floating-point double ceiling 


E.MUL.F.64.F 


Ensemble multiply floating-point double floor 


E.MUL.F.64.N 


Ensemble multiply floating-point double nearest 


E.MUL.F.64.X 


Ensemble multiply floating-point double exact 


E.MUL.F.64.Z 


Ensemble multiply floating-point double zero 


E.MUL.F.128 


Ensemble multiply floating-point quad 


E.MUL.F.128.C 


Ensemble multiply floating-point quad ceiling 


E.MUL.F.128.F 


Ensemble multiply floating-point quad floor 


E.MUL.F.128.N 


Ensemble multiply floating-point quad nearest 


E.MUL.F.128.X 


Ensemble multiply floating-point quad exact 


E.MUL.F.128.Z 


Ensemble multiply floating-point quad zero



FIG. 30A-2 



Selection 



class 


op 


prec 


round/trap 


add 


E.ADD.F


16 


32 


64 


128 


NONE C F N X Z 


divide 


E.DIV.F


16 


32 


64 


128 


NONE C F N X Z 


multiply 


E.MUL.F


16 


32 


64 


128 


NONE C F N X Z 


complex multiply 


E.MUL.C.F


16 


32 


64 




NONE 



Format 



E.op.prec.round rd=rc,rb 

rd=eopprecround(rc, rb) 

31 24 23 18 17 12 11 6 5 0

| E.prec | rd | rc | rb | op.round |
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Definition 

def mul(size,v,i,w,j) as
    mul <- fmul(F(size,v_{size-1+i..i}),F(size,w_{size-1+j..j}))
enddef

def EnsembleFloatingPoint(op,prec,round,ra,rb,rc) as
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    for i <- 0 to 128-prec by prec
        ci <- F(prec,c_{i+prec-1..i})
        bi <- F(prec,b_{i+prec-1..i})
        case op of
            E.ADD.F:
                ai <- faddr(ci,bi,round)
            E.MUL.F:
                ai <- fmul(ci,bi)
            E.MUL.C.F:
                if (i and prec) then
                    ai <- fadd(mul(prec,c,i,b,i-prec), mul(prec,c,i-prec,b,i))
                else
                    ai <- fsub(mul(prec,c,i,b,i), mul(prec,c,i+prec,b,i+prec))
                endif
            E.DIV.F:
                ai <- fdiv(ci,bi)
        endcase
        a_{i+prec-1..i} <- PackF(prec, ai, round)
    endfor
    RegWrite(rd, 128, a)
enddef




Operation codes 



E.SUB.F.16 


Ensemble subtract floating-point half


E.SUB.F.16.C 


Ensemble subtract floating-point half ceiling 


E.SUB.F.16.F 


Ensemble subtract floating-point half floor 


E.SUB.F.16.N 


Ensemble subtract floating-point half nearest 


E.SUB.F.16.Z 


Ensemble subtract floating-point half zero 


E.SUB.F.16.X 


Ensemble subtract floating-point half exact 


E.SUB.F.32 


Ensemble subtract floating-point single 


E.SUB.F.32.C 


Ensemble subtract floating-point single ceiling 


E.SUB.F.32.F 


Ensemble subtract floating-point single floor 


E.SUB.F.32.N 


Ensemble subtract floating-point single nearest 


E.SUB.F.32.Z 


Ensemble subtract floating-point single zero 


E.SUB.F.32.X 


Ensemble subtract floating-point single exact 


E.SUB.F.64 


Ensemble subtract floating-point double 


E.SUB.F.64.C 


Ensemble subtract floating-point double ceiling 


E.SUB.F.64.F 


Ensemble subtract floating-point double floor 


E.SUB.F.64.N 


Ensemble subtract floating-point double nearest 


E.SUB.F.64.Z 


Ensemble subtract floating-point double zero 


E.SUB.F.64.X 


Ensemble subtract floating-point double exact 


E.SUB.F.128 


Ensemble subtract floating-point quad 


E.SUB.F.128.C 


Ensemble subtract floating-point quad ceiling 


E.SUB.F.128.F 


Ensemble subtract floating-point quad floor 


E.SUB.F.128.N 


Ensemble subtract floating-point quad nearest 


E.SUB.F.128.Z 


Ensemble subtract floating-point quad zero 


E.SUB.F.128.X


Ensemble subtract floating-point quad exact 
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Selection 



class 


op 


prec 


round/trap 


set 


SET. 

E LG 
L GE 


16 32 64 128 


NONE X 


subtract 


SUB 


16 32 64 128 


NONE C F N X Z 



Format 



E.op.prec.round rd=rb,rc

rd=eopprecround(rb,rc)

31 24 23 18 17 12 11 6 5 0

| E.prec | rd | rc | rb | op.round |

8 6 6 6 6 
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Definition 



def EnsembleReversedFloatingPoint(op,prec,round,rd,rc,rb) as
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    for i <- 0 to 128-prec by prec
        ci <- F(prec,c_{i+prec-1..i})
        bi <- F(prec,b_{i+prec-1..i})
        ai <- frsubr(ci,-bi, round)
        a_{i+prec-1..i} <- PackF(prec, ai, round)
    endfor
    RegWrite(rd, 128, a)
enddef
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Operation codes 



X.COMPRESS.2 


Crossbar compress signed pecks 


X.COMPRESS.4 


Crossbar compress signed nibbles 


X.COMPRESS.8 


Crossbar compress signed bytes 


X.COMPRESS.16 


Crossbar compress signed doublets 


X.COMPRESS.32 


Crossbar compress signed quadlets 


X.COMPRESS.64 


Crossbar compress signed octlets 


X.COMPRESS.128


Crossbar compress signed hexlet 


X.COMPRESS.U.2 


Crossbar compress unsigned pecks 


X.COMPRESS.U.4 


Crossbar compress unsigned nibbles 


X.COMPRESS.U.8 


Crossbar compress unsigned bytes 


X.COMPRESS.U.16


Crossbar compress unsigned doublets 


X.COMPRESS.U.32 


Crossbar compress unsigned quadlets 


X.COMPRESS.U.64


Crossbar compress unsigned octlets 


X.COMPRESS.U.128


Crossbar compress unsigned hexlet 


X.EXPAND.2


Crossbar expand signed pecks


X.EXPAND.4


Crossbar expand signed nibbles


X.EXPAND.8


Crossbar expand signed bytes


X.EXPAND.16


Crossbar expand signed doublets


X.EXPAND.32


Crossbar expand signed quadlets


X.EXPAND.64


Crossbar expand signed octlets


X.EXPAND.128


Crossbar expand signed hexlet


X.EXPAND.U.2


Crossbar expand unsigned pecks


X.EXPAND.U.4


Crossbar expand unsigned nibbles


X.EXPAND.U.8


Crossbar expand unsigned bytes


X.EXPAND.U.16


Crossbar expand unsigned doublets


X.EXPAND.U.32


Crossbar expand unsigned quadlets


X.EXPAND.U.64


Crossbar expand unsigned octlets


X.EXPAND.U.128


Crossbar expand unsigned hexlet


X.ROTL.2


Crossbar rotate left pecks


X.ROTL.4


Crossbar rotate left nibbles


X.ROTL.8


Crossbar rotate left bytes


X.ROTL.16


Crossbar rotate left doublets


X.ROTL.32


Crossbar rotate left quadlets


X.ROTL.64


Crossbar rotate left octlets


X.ROTL.128


Crossbar rotate left hexlet


X.ROTR.2


Crossbar rotate right pecks


X.ROTR.4


Crossbar rotate right nibbles


X.ROTR.8


Crossbar rotate right bytes


X.ROTR.16


Crossbar rotate right doublets
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X.ROTR.32 


Crossbar rotate right quadlets 


X.ROTR.64


Crossbar rotate right octlets 


X.ROTR.128 


Crossbar rotate right hexlet 


X.SHL.2 


Crossbar shift left pecks 


X.SHL.2.O


Crossbar shift left signed pecks check overflow 


X.SHL.4 


Crossbar shift left nibbles 


X.SHL.4.O


Crossbar shift left signed nibbles check overflow 


X.SHL.8 


Crossbar shift left bytes 


X.SHL.8.O


Crossbar shift left signed bytes check overflow 


X.SHL.16 


Crossbar shift left doublets 


X.SHL.16.O


Crossbar shift left signed doublets check overflow 


X.SHL.32 


Crossbar shift left quadlets 


X.SHL.32.O


Crossbar shift left signed quadlets check overflow 


X.SHL.64 


Crossbar shift left octlets 


X.SHL.64.O


Crossbar shift left signed octlets check overflow 


X.SHL.128 


Crossbar shift left hexlet 


X.SHL.128.O


Crossbar shift left signed hexlet check overflow 


X.SHL.U.2.O


Crossbar shift left unsigned pecks check overflow 


X.SHL.U.4.O


Crossbar shift left unsigned nibbles check overflow 


X.SHL.U.8.O


Crossbar shift left unsigned bytes check overflow 


X.SHL.U.16.O


Crossbar shift left unsigned doublets check overflow 


X.SHL.U.32.O


Crossbar shift left unsigned quadlets check overflow 


X.SHL.U.64.O


Crossbar shift left unsigned octlets check overflow 


X.SHL.U.128.O


Crossbar shift left unsigned hexlet check overflow 


X.SHR.2 


Crossbar signed shift right pecks 


X.SHR.4 


Crossbar signed shift right nibbles 


X.SHR.8 


Crossbar signed shift right bytes 


X.SHR.16 


Crossbar signed shift right doublets 


X.SHR.32 


Crossbar signed shift right quadlets 


X.SHR.64 


Crossbar signed shift right octlets 


X.SHR.128 


Crossbar signed shift right hexlet 


X.SHR.U.2 


Crossbar shift right unsigned pecks 


X.SHR.U.4 


Crossbar shift right unsigned nibbles 


X.SHR.U.8 


Crossbar shift right unsigned bytes 


X.SHR.U.16 


Crossbar shift right unsigned doublets 


X.SHR.U.32 


Crossbar shift right unsigned quadlets 


X.SHR.U.64 


Crossbar shift right unsigned octlets 


X.SHR.U.128


Crossbar shift right unsigned hexlet 
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Selection 



class 


op 


size 


precision 


EXPAND EXPAND.U
COMPRESS COMPRESS.U 


2 4 8 16 32 64 128 


shift 


ROTR ROTL SHR SHL 
SHL.O SHL.U.O SHR.U


2 4 8 16 32 64 128 



Format 

X.op.size rd=rc,rb 
rd=xopsize(rc,rb) 

31 25 24 23 18 17 12 11 6 5 2 1 0

| X.SHIFT | s | rd | rc | rb | op | sz |

7 1 6 6 6 4 2

lsize <- log(size)
s <- lsize_{2}
sz <- lsize_{1..0}



FIG. 32B 



Definition 

def Crossbar(op,size,rd,rc,rb)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    shift <- b and (size-1)
    case op_{5..2} || 0^{2} of
        X.COMPRESS:
            hsize <- size/2
            for i <- 0 to 64-hsize by hsize
                if shift < hsize then
                    a_{i+hsize-1..i} <- c_{i+i+shift+hsize-1..i+i+shift}
                else
                    a_{i+hsize-1..i} <- c_{i+i+size-1}^{shift-hsize} || c_{i+i+size-1..i+i+shift}
                endif
            endfor
            a_{127..64} <- 0
        X.COMPRESS.U:
            hsize <- size/2
            for i <- 0 to 64-hsize by hsize
                if shift < hsize then
                    a_{i+hsize-1..i} <- c_{i+i+shift+hsize-1..i+i+shift}
                else
                    a_{i+hsize-1..i} <- 0^{shift-hsize} || c_{i+i+size-1..i+i+shift}
                endif
            endfor
            a_{127..64} <- 0
        X.EXPAND:
            hsize <- size/2
            for i <- 0 to 64-hsize by hsize
                if shift < hsize then
                    a_{i+i+size-1..i+i} <- c_{i+hsize-1}^{hsize-shift} || c_{i+hsize-1..i} || 0^{shift}
                else
                    a_{i+i+size-1..i+i} <- c_{i+size-shift-1..i} || 0^{shift}
                endif
            endfor



FIG. 32C-1 



        X.EXPAND.U:
            hsize <- size/2
            for i <- 0 to 64-hsize by hsize
                if shift < hsize then
                    a_{i+i+size-1..i+i} <- 0^{hsize-shift} || c_{i+hsize-1..i} || 0^{shift}
                else
                    a_{i+i+size-1..i+i} <- c_{i+size-shift-1..i} || 0^{shift}
                endif
            endfor
        X.ROTL:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- c_{i+size-1-shift..i} || c_{i+size-1..i+size-1-shift}
            endfor
        X.ROTR:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- c_{i+shift-1..i} || c_{i+size-1..i+shift}
            endfor
        X.SHL:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- c_{i+size-1-shift..i} || 0^{shift}
            endfor
        X.SHL.O:
            for i <- 0 to 128-size by size
                if c_{i+size-1..i+size-1-shift} ≠ c_{i+size-1-shift}^{shift+1} then
                    raise FixedPointArithmetic
                endif
                a_{i+size-1..i} <- c_{i+size-1-shift..i} || 0^{shift}
            endfor



FIG. 32C-2 



        X.SHL.U.O:
            for i <- 0 to 128-size by size
                if c_{i+size-1..i+size-shift} ≠ 0^{shift} then
                    raise FixedPointArithmetic
                endif
                a_{i+size-1..i} <- c_{i+size-1-shift..i} || 0^{shift}
            endfor
        X.SHR:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- c_{i+size-1}^{shift} || c_{i+size-1..i+shift}
            endfor
        X.SHR.U:
            for i <- 0 to 128-size by size
                a_{i+size-1..i} <- 0^{shift} || c_{i+size-1..i+shift}
            endfor
    endcase
    RegWrite(rd, 128, a)
enddef



F/G. 32C -3 



Compress 32 bits to 16, with 4-bit right shift 



FIG. 32D 



Format 

X.EXTRACT ra=rd,rc,rb 

ra=xextract(rd, rc, rb) 

31 24 23 18 17 12 11 6 5 0

| op | rd | rc | rb | ra |

8 6 6 6 6



FIG. 33A 



Definition 



def CrossbarExtract(op,ra,rb,rc,rd) as
    d <- RegRead(rd, 128)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    case b_{8..0} of
        0..255:
            gsize <- 128
        256..383:
            gsize <- 64
        384..447:
            gsize <- 32
        448..479:
            gsize <- 16
        480..495:
            gsize <- 8
        496..503:
            gsize <- 4
        504..507:
            gsize <- 2
        508..511:
            gsize <- 1
    endcase
    m <- b_{12}
    as <- signed <- b_{14}
    h <- (2-m) * gsize
    spos <- (b_{8..0}) and ((2-m)*gsize-1)
    dpos <- (0 || b_{23..16}) and (gsize-1)
    sfsize <- (0 || b_{31..24}) and (gsize-1)
    tfsize <- (sfsize = 0) or ((sfsize+dpos) > gsize) ? gsize-dpos : sfsize
    fsize <- (tfsize + spos > h) ? h - spos : tfsize
    for i <- 0 to 128-gsize by gsize
        case op of
            X.EXTRACT:
                if m then
                    p <- d_{gsize+i-1..i}
                else
                    p <- (d || c)_{2*(gsize+i)-1..2*i}
                endif
        endcase
        v <- (as & p_{h-1}) || p
        w <- (as & v_{spos+fsize-1})^{gsize-fsize-dpos} || v_{fsize-1+spos..spos} || 0^{dpos}
        if m then
            a_{size-1+i..i} <- c_{gsize-1+i..dpos+fsize+i} || w_{dpos+fsize-1..dpos} || c_{dpos-1+i..i}
        else
            a_{size-1+i..i} <- w
        endif
    endfor
    RegWrite(ra, 128, a)
enddef
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| s,t 






rd 1 


2* gsize \ \ 






gsize \ 


— ► 








+~ 




s 


ab 


0 






fsize 
► 


dpos 



rc||rb 



Crossbar extract 
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fsize spos 
» ^ — 



s.t 



rd 



rc 



gsize 1 



rb 



rd 



fsize 



Crossbar merge extract 
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X.SHUFFLE.4 


Crossbar shuffle within pecks 


X.SHUFFLE.8


Crossbar shuffle within bytes 


X.SHUFFLE.16


Crossbar shuffle within doublets 


X.SHUFFLE.32 


Crossbar shuffle within quadlets 


X.SHUFFLE.64 


Crossbar shuffle within octlets 


X.SHUFFLE.128


Crossbar shuffle within hexlet 


X.SHUFFLE.256 


Crossbar shuffle within triclet 



FIG. 34A 



Format 



X.SHUFFLE.256 rd=rc,rb,v,w,h 
X.SHUFFLE.size rd=rcb,v,w

rd=xshuffle256(rc,rb,v,w,h) 
rd=xshufflesize(rcb,v,w) 

31 24 23 18 17 12 11 6 5 0

| X.SHUFFLE | rd | rc | rb | op |

8 6 6 6 6 

rc <— rb <— rcb 
x<~log2(size) 
y<-log2(v) 
z<-log2(w) 

op <- ((x*x*x-3*x*x-4*x)/6-(z*z-z)/2+x*z+y) + (size=256)*(h*32-56) 



FIG. 34B 



Definition 



def CrossbarShuffle(major,rd,rc,rb,op)
    c <- RegRead(rc, 128)
    b <- RegRead(rb, 128)
    if rc=rb then
        case op of
            0..55:
                for x <- 2 to 7; for y <- 0 to x-2; for z <- 1 to x-y-1
                    if op = ((x*x*x-3*x*x-4*x)/6-(z*z-z)/2+x*z+y) then
                        for i <- 0 to 127
                            a_{i} <- c_{(i_{6..x} || i_{y+z-1..y} || i_{x-1..y+z} || i_{y-1..0})}
                        end
                    endif
                endfor; endfor; endfor
            56..63:
                raise ReservedInstruction
        endcase
    elseif
        case op_{4..0} of
            0..27:
                cb <- c || b
                x <- 8
                h <- op_{5}
                for y <- 0 to x-2; for z <- 1 to x-y-1
                    if op_{4..0} = ((17*z-z*z)/2-8+y) then
                        for i <- h*128 to 127+h*128
                            a_{i-h*128} <- cb_{(i_{y+z-1..y} || i_{x-1..y+z} || i_{y-1..0})}
                        end
                    endif
                endfor; endfor
            28..31:
                raise ReservedInstruction
        endcase
    endif
    RegWrite(rd, 128, a)
enddef
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Figure 35A 



Wide Solve Galois 



qalpolv 1 *galpoly ] solv par 1 wsolv g 1 



[ 



wminor 



8 6 6 6 6 

Solves L*S = W mod z**8 in 8 iterations 



=0? 

Iteration 
control 
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conditional 
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theta 



Figure 35B Wide Solve Galois 

static v8_t wsolveg(v8_t hh, v8_t syndrome, v8_t *omega) 

for ( r=0; r < N_PARITY; r++) /*: A + 16*(B+A) :*/ 

{ delta = _xcopyi8(delta0,0); /*: 16*X :*/ 

delta0s = _castv8(_xshrm128(_castv128(delta0),_castv128(delta1),8)); /*: 16*X :*/ 

delta1s = _reindex8(delta1, -1); /*: 16*X :*/ 

delta0 = _gxor8(_emulg8(gamma,delta0s, hh),_emulg8(delta,theta0, hh)); /*: 16*(2*E+G) :*/ 

delta1 = _gxor8(_emulg8(gamma,delta1s, hh),_emulg8(delta,theta1, hh)); /*: 16*(2*E+G) :*/ 

s = _gsetandne8(delta, _gsetge8(k,_gzero8)); /*: 16*2*G :*/ 

theta0 = _gmux8(s,delta0s,theta0); /*: 16*G :*/ 

theta1 = _gmux8(s,delta1s,theta1); /*: 16*G :*/ 

gamma = _gmux8(s,delta,gamma); /*: 16*G :*/ 

k = _gmux8(s,_gnot8(k),_gadd8(k,_gone8)); /*: 16*3*G :*/ 

lambda = _xselect8(delta1,delta0,USE_VCONST(lambdai)); /*: X :*/ 
*omega = _castv8(_xwithdrawu128(_castv128(delta0),64,0)); 



/*: X :*/ 
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Wide FFT Slice 



wminor 



8 



6 



data I *twiddl ffipar 1 wffislic 
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Wide Cache Strip 




Butterfly / Mux Strip 



Butterfly / Mux Strip 



Butterfly / Mux Strip 



6 



Coefficient 
RAM/ROM 



> — fe n 



Coefficient 
RAM/ROM 



Wide Cache Strip 



Coefficient 
RAM/ROM 



Coefficient 
RAM/ROM 



/*******************************************/ 

/* DSP library module : Inverse FFT, selectable length, */ 

/* 16-bit complex integers, */ 

/* split-radix algorithm */ 

/* */ 

/*******************************************/ 

/* include files */ 
#include <stdio.h> 
#include "broadmx.h" 
#include "affirm.h" 
#include "dspFFTud.h" 
#include <math.h> 

#define SHOW 0 



/* typed version of gboolean: should be part of gops */ 

static INLINE v16_t _gboolean16(v16_t src1, v16_t src2, v16_t src3, int imm) 
{ 
return _gboolean(src1.rr, src2.rr, src3.rr, imm).v16; 

} 

/* - - - - 

* I * (a - b) / 2 
*/ 

static inline vc16_t _sub_mul_by_i_c16(vc16_t aa, vc16_t bb) 
{ 

v16_t muxmask = _castv16(_gcopyi32(0xFFFF)); 

v16_t xx; 

/* xx = _gsubh16n(_gmux16(muxmask,aa,bb),_gmux16(muxmask,bb,aa)); */ 
xx = _gsubh16n(_gxor16(muxmask,bb),_gxor16(muxmask,aa)); 
xx = _xswizzle16(xx, 7, 1); 
return xx; 

} 

Fig. 36B 



/* 

* Perform 4 independent 4-point fft's 
* 

* x0..x3 holds the input to the transform, 4 sets of 4 complex numbers. 

* Each set is inverse- fourier transformed independently of the others. 

* The results appear in x0..x3. The original values of y0..y3 are corrupted. 

*/ 

#define QUAD_IFFT_4PT_c16(_y0,_y1,_y2,_y3, _x0,_x1,_x2,_x3) {\ 
_y0 = _gaddh16n(_x0,_x2); \ 
_y1 = _gaddh16n(_x1,_x3); \ 
_y2 = _gsubh16n(_x0,_x2); \ 
_y3 = _sub_mul_by_i_c16(_x1,_x3); \ 
_x0 = _gaddh16n(_y0,_y1); \ 
_x2 = _gsubh16n(_y0,_y1); \ 
_x1 = _gaddh16n(_y2,_y3); \ 
_x3 = _gsubh16n(_y2,_y3); \ 
} 

/* - - 

* Perform 4 independent 2-point fft's 
* 

* x0..x1 holds the input to the transform, 4 sets of 2 complex numbers. 

* Each set is inverse-fourier transformed independently of the others. 

* The results appear in y0..y1. 
*/ 

#define QUAD_IFFT_2PT_c16(_y0,_y1, _x0,_x1) {\ 
_y0 = _gaddh16n(_x0,_x1); \ 
_y1 = _gsubh16n(_x0,_x1); \ 
} 



Fig. 36B (cont) 



static int _wfftslicec16(vc16_t *dp, vc16_t *tp, int dn, int ds, int tn, int radix, int reorder, int extract) 
{ 
int i,j,ii, logmost; 
vc16_t *dwp, *twp; 

vc16_t t0,t1,t2,t3, d0,d1,d2,d3, p0,p1,p2,p3, z0,z1,z2,z3, m, n; 

if(SHOW) printf("extract = %d\n",extract&0xf); 
n = m = _gcopyi16(0); 
if (radix==4) { 
if (ds==1) { 

for (twp=tp,i=0; i<tn; dp++,twp++,i+=NELEMC16) { 
t0 = twp[0]; 
d0 = dp[0]; 

p0 = _emulx16(t0,d0,extract); 

z0 = _xshri16(p0,1); 

n = _gboolean16(n,p0,z0,0xf6); 

d0 = _vput16(d0,0,(_vget16(p0,0)+_vget16(p0,2)+_vget16(p0,4)+_vget16(p0,6)+2)>>2); 
d0 = _vput16(d0,1,(_vget16(p0,1)+_vget16(p0,3)+_vget16(p0,5)+_vget16(p0,7)+2)>>2); 
d0 = _vput16(d0,4,(_vget16(p0,0)-_vget16(p0,2)+_vget16(p0,4)-_vget16(p0,6)+2)>>2); 
d0 = _vput16(d0,5,(_vget16(p0,1)-_vget16(p0,3)+_vget16(p0,5)-_vget16(p0,7)+2)>>2); 
d0 = _vput16(d0,2,(_vget16(p0,0)-_vget16(p0,3)-_vget16(p0,4)+_vget16(p0,7)+2)>>2); 
d0 = _vput16(d0,3,(_vget16(p0,1)+_vget16(p0,2)-_vget16(p0,5)-_vget16(p0,6)+2)>>2); 
d0 = _vput16(d0,6,(_vget16(p0,0)+_vget16(p0,3)-_vget16(p0,4)-_vget16(p0,7)+2)>>2); 
d0 = _vput16(d0,7,(_vget16(p0,1)-_vget16(p0,2)-_vget16(p0,5)+_vget16(p0,6)+2)>>2); 
z0 = _xshri16(d0,1); 
m = _gboolean16(m,d0,z0,0xf6); 
dp[0] = d0; 

} 

} else { 
ii = ds / NELEMC16; 

for (twp=tp,i=0; i<tn; dp++,twp++,i+=4*NELEMC16) { 
t0 = twp[0*ii]; 
t1 = twp[1*ii]; 
t2 = twp[2*ii]; 
t3 = twp[3*ii]; 

for (dwp=dp,j=0; j<dn; dwp+=4*ii,j+=4*ds) { 

d0 = dwp[0*ii]; 
d1 = dwp[1*ii]; 
d2 = dwp[2*ii]; 
d3 = dwp[3*ii]; 

d0 = _emulx16(t0,d0,extract); // can be eextract 
d1 = _emulx16(t1,d1,extract); 
d2 = _emulx16(t2,d2,extract); 
d3 = _emulx16(t3,d3,extract); 

z0 = _xshri16(d0,1); 
z1 = _xshri16(d1,1); 
z2 = _xshri16(d2,1); 
z3 = _xshri16(d3,1); 

n = _gboolean16(n,d0,z0,0xf6); 
n = _gboolean16(n,d1,z1,0xf6); 
n = _gboolean16(n,d2,z2,0xf6); 
n = _gboolean16(n,d3,z3,0xf6); 
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QU AD_IFFT_4PT_c 1 6(pO,p 1 ,p2,p3, dO,d 1 ,d2,d3); 

z0 = _xshri16(d0,1); 

z1 = _xshri16(d1,1); 

z2 = _xshri16(d2,1); 

z3 = _xshri16(d3,1); 

m = _gboolean16(m,d0,z0,0xf6); 

m = _gboolean16(m,d1,z1,0xf6); 

m = _gboolean16(m,d2,z2,0xf6); 

m = _gboolean16(m,d3,z3,0xf6); 

dwp[0*ii] = d0; 

dwp[1*ii] = d1; 

dwp[2*ii] = d2; 

dwp[3*ii] = d3; 

} 

} 

} 

} else if (radix==2) { 
ii = ds / NELEMC16; 

for (twp=tp,i=0; i<tn; dp++,twp++,i+=2*NELEMC16) { 
t0 = twp[0*ii]; 
t1 = twp[1*ii]; 

for (dwp=dp,j=0; j<dn; dwp+=2*ii,j+=2*ds) { 
d0 = dwp[0*ii]; 
d1 = dwp[1*ii]; 

p0 = _emulx16(t0,d0,extract); // can be eextract 

p1 = _emulx16(t1,d1,extract); 

z0 = _xshri16(p0,1); 

z1 = _xshri16(p1,1); 

n = _gboolean16(n,p0,z0,0xf6); 

n = _gboolean16(n,p1,z1,0xf6); 

QUAD_IFFT_2PT_c16(d0,d1, p0,p1); 

z0 = _xshri16(d0,1); 

z1 = _xshri16(d1,1); 

m = _gboolean16(m,d0,z0,0xf6); 

m = _gboolean16(m,d1,z1,0xf6); 

dwp[0*ii] = d0; 

dwp[1*ii] = d1; 

} 

} 

} else { 

for (j=0; j<dn; dp++,tp++,j+=NELEMC16) { 
*dp = d0 = *tp; 
z0 = _xshri16(d0,1); 
m = _gboolean16(m,d0,z0,0xf6); 

} 

n = m; 

} 

Fig. 36B (cont) 



n = _gor16(n,_castv16(_xshriu128(_castv128(n),64))); 

n = _gor16(n,_castv16(_xshriu128(_castv128(n),32))); 

n = _gor16(n,_castv16(_xshriu128(_castv128(n),16))); 

logmost = _vget16(_elogmost16(n),0); 

if(SHOW) printf("logmost = %d (after mulx)\n", logmost); 

m = _gor16(m,_castv16(_xshriu128(_castv128(m),64))); 

m = _gor16(m,_castv16(_xshriu128(_castv128(m),32))); 

m = _gor16(m,_castv16(_xshriu128(_castv128(m),16))); 

logmost = _vget16(_elogmost16(m),0); 

if(SHOW) printf("logmost = %d (after addh)\n", logmost); 

return logmost; 

} 

static cplxi16 const exptab[][4] = 
#define IFFT_COEFS_16 
#include "dspIFFT-coefs.h" 
#undef IFFT_COEFS_16 



static void make_twiddle(cplxi16 *tw, int ni, int nj, int len, int show) 
{ 

int ii,jj; 

for(ii = 0; ii < ni; ++ii) { 

for(jj = 0; jj < nj; ++jj) { 

tw->re = rint(-32768*cos(2*M_PI/len*ii*jj)); 
tw->im = rint(-32768*sin(2*M_PI/len*ii*jj)); 

if(show) printf("twiddle[%d][%d] = (%7d,%7d)\n", ii, jj, tw->re, tw->im); 
++tw; 

} 

} 

} 

int dspInverseFourier_slice_c16(cplxi16 *out, cplxi16 const *in, int len) 
{ 

int logmost, extract, scale; 
static cplxi16 twidtab[12][1024]; 
int i, j, k, l; 
int ds, tn; 

for(i = 0; i < len; ++i) { 

twidtab[0][i].re = -32768; 
twidtab[0][i].im = 0; 

} 

make_twiddle(&twidtab[1][0], 4, 4, 16, 0); 
make_twiddle(&twidtab[2][0], 4, 16, 64, 0); 
make_twiddle(&twidtab[3][0], 4, 64, 256, 0); 
make_twiddle(&twidtab[4][0], 2, 256, 512, 0); 
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scale = 0; 
logmost = 0; 
if(len == 4) { 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)in, len, 0, 0, 1, 0, 0); 
scale = 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 
logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[0], len, 1, len, 4, 0, extract); 
} else if(len == 16) { 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)in, len, 0, 0, 1, 0, 0); 
scale = 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[0], len, 1, len, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 
logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[1], len, 4, 16, 4, 0, extract); 
} else if(len == 64) { 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)in, len, 0, 0, 1, 0, 0); 
scale = 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[0], len, 1, len, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[1], len, 4, 16, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 
logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[2], len, 16, 64, 4, 0, extract); 
scale -= 2; 
} else if(len == 256) { 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)in, len, 0, 0, 1, 0, 0); 
scale = 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[0], len, 1, len, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[1], len, 4, 16, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[2], len, 16, 64, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[3], len, 64, 256, 4, 0, extract); 
scale -= 4; 
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} else if(len ==512) { 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)in, len, 0, 0, 1, 0, 0); 
scale = 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[0], len, 1, len, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[1], len, 4, 16, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[2], len, 16, 64, 4, 0, extract); 

scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[3], len, 64, 256, 4, 0, extract); 
scale += 16 - logmost; 

extract = (1<<14) + (1<<13) + (2<<9) + (512-4*16+logmost+1); 

logmost = _wfftslicec16((vc16_t *)out, (vc16_t *)twidtab[4], len, 256, 512, 2, 0, extract); 
scale -= 7; 

} 

if(SHOW) printf("scale = %d\n",scale); 
return scale; 
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Format 

W.CONVOLVE.X.order ra=rc,rd,rb 
ra=wop(rc,rd,rb) 

31 24 23 18 17 12 11 6 5 0 

| W.op | rd | rc | rb | ra | 

8 6 6 6 6 
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Definition 



def mul(size,h,vs,v,i,ws,w,j) as 

mul <- ((vs & v_{size-1+i})^{h-size} || v_{size-1+i..i}) * ((ws & w_{size-1+j})^{h-size} || w_{size-1+j..j}) 
enddef 

def WideConvolveExtract(op,ra,rb,rc,rd) 
d <- RegRead(rd, 64) 
c <- RegRead(rc, 64) 
b <- RegRead(rb, 128) 
case b_{8..0} of 
0..255: 

sgsize <- 128 
256..383: 

sgsize <- 64 
384..447: 

sgsize <- 32 
448..479: 

sgsize <- 16 
480..495: 

sgsize <- 8 
496..503: 

sgsize <- 4 
504..507: 

sgsize <- 2 
508..511: 

sgsize <- 1 

endcase 
l <- b_{11} 
m <- b_{12} 
n <- b_{13} 
signed <- b_{14} 
x <- b_{15} 

if (c_{2..0} ≠ 0) or (d_{2..0} ≠ 0) then 
raise ReservedInstruction 

endif 

cwsize <- (c and (0-c)) || 0^{5} 
ct <- c and (c-1) 
cmsize <- (ct and (0-ct)) || 0^{4} 
ca <- ct and (ct-1) 
lcmsize <- log(cmsize) 
lcwsize <- log(cwsize) 
cm <- LoadMemory(c,ca,cmsize,order) 
dwsize <- (d and (0-d)) || 0^{5} 
dt <- d and (d-1) 
dmsize <- (dt and (0-dt)) || 0^{4} 
da <- dt and (dt-1) 
ldmsize <- log(dmsize) 
ldwsize <- log(dwsize) 
dm <- LoadMemory(d,da,dmsize,order) 
if (sgsize < 8) or (sgsize > wsize/2) then 
raise ReservedInstruction 

endif 

gsize <- sgsize 
lgsize <- log(gsize) 
case op of 

W.CONVOLVE.X.B: 

order <- B 
W.CONVOLVE.X.L: 
order <- L 

endcase 

cs <- signed 

ds <- signed ^ m 

zs <- signed or m or n 

zsize <- gsize*(x+1) 

h <- (2*gsize) + ldmsize - lgsize 

spos <- (b_{8..0}) and (2*gsize-1) 

dpos <- (0 || b_{23..16}) and (zsize-1) 

r <- spos 

sfsize <- (0 || b_{31..24}) and (zsize-1) 

tfsize <- (sfsize = 0) or ((sfsize+dpos) > zsize) ? zsize-dpos : sfsize 
fsize <- (tfsize + spos > h+1) ? h+1 - spos : tfsize 
if (b_{10..9} = Z) and not zs then 
rnd <- F 

else 

rnd <- b_{10..9} 

endif 

mzero <- b_{95..64} 
mpos <- b_{63..32} 
oo <- mpos || 0^{3} 
ox <- oo_{lcwsize-1..lgsize} 
oy <- oo_{lcmsize-1..lcwsize} 
zz <- (-mzero) || 1^{3} 
zx <- zz_{ldwsize-1..lgsize} 
zy <- zz_{ldmsize-1..ldwsize} 
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for k <r- 0 to 128-zsize by zsize 
i <- k*gsize/zsize 
ix <- i_{lcwsize-1..lgsize} 
iy <- i_{lcmsize-1..lcwsize} 
q[0] <- 0^{h} 

for j <- 0 to dmsize-gsize by gsize 
jj <- n and j_{lgsize} and not i_{lgsize} 
jx <- j_{ldwsize-1..lgsize} 
jy <- j_{ldmsize-1..ldwsize} 

u <- (oy+iy-jy)icmsize-lcwsize-1..0 II (ox+ix-jx-2*jj)i cms i ze .| C wsize-1..0 II 0 lg; 
if (jx>zx) or (jy >z y) and ( dm lgsize-l+j..j° ) and undefined then 
q[j+gsize] <- q[j] 

else 

if jj then 

q[j+gsize] <- q[j] - mul(gsize,h,cs,cm,u,ds,dm,j) 

else 

q[j+gsize] <- q[j] + mul(gsize,h,cs,cm,u,ds,dm,j) 

endif 

endif 
endfor 

p <- q[dmsize] 
case rnd of 

none, N: 

s <- 0^{h-r} || ~p_{r} || p_{r}^{r-1} 

Z: 

s <- 0^{h-r} || p_{h-1}^{r} 

F: 

s <- 0^{h} 

C: 

s <- 0^{h-r} || 1^{r} 

endcase 

v <- ((zs & p_{h-1}) || p) + (0 || s) 

if (v_{h..r+fsize} = (zs & v_{r+fsize-1})^{h+1-r-fsize}) or not l then 

w <- (zs & v_{r+fsize-1})^{zsize-fsize-dpos} || v_{fsize-1+r..r} || 0^{dpos} 

else 

w <- (zs ? (v_{h}^{zsize-fsize-dpos+1} || ~v_{h}^{fsize-1}) : 0^{zsize-fsize-dpos} || 1^{fsize}) || 0^{dpos} 

endif 

z_{zsize-1+k..k} <- w 
endfor 

RegWrite(ra, 128, z) 
enddef 

Fig. 37B 
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